This document includes the analysis of simple and complex satellite abundances in different Black/6 and Black/10 mouse strains, and uses the written history (generations at splits along the “phylogeny”) to calculate rates of copy number change.

First, load in and process the data

data <- read.table("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_MA_30x_trimmed_data_kseek.rep.normGC", header=T)
#data[1:5, 1:5]
nrow(data)
## [1] 13749
#colnames(data)

#need to flip the data frame since it's in the opposite orientation
data2 <- data.frame(t(data[-1]))
# match the file names to the names of the lines  
#data2[, 1:5]
colnames(data2) <- data[,1]

line_names <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_line_names.csv", header=F)
line_files <- as.vector(line_names[,1])
line_names <- as.vector(line_names[,2])
names(line_names) <- line_files
line_names_df <- as.data.frame(line_names)

data_withnames <- merge (data2, line_names, by=0)
lines <- as.vector(data_withnames[,"y"])
data_withnames[,"y"] <- NULL
data_withnames[,1] <- NULL
rownames (data_withnames) <- lines
#data_withnames[,1:3]
#data2[,1:5]

# simplify the kmer names 
kmer.labels <- sapply(strsplit(colnames(data_withnames), "\\/"), '[', 1)
colnames(data_withnames) <- kmer.labels

# now calculate the means and sort the data
kmer_means <- colMeans(data_withnames)
data_sorted <- data_withnames[,order(kmer_means, decreasing=T)]  

Next, select the kmers to focus on based on abundance, and plot PCAs

# first use common kmers, which have an average abundance of at least 100
get_common_kmers <- function (kmer_matrix) {
  common.kmer.indexes <- c()
  for (i in 1:ncol(kmer_matrix)) {
    if (mean(kmer_matrix[,i])>=100) {
      common.kmer.indexes <- c(common.kmer.indexes, i)
    }
  }
  names(common.kmer.indexes) <- colnames(kmer_matrix)[common.kmer.indexes]
  return (common.kmer.indexes)
}
data_sorted_common <- data_sorted[,get_common_kmers(data_sorted)]
#data_sorted_common[,1:5]
ncol(data_sorted_common)
## [1] 63
#write.csv(data_sorted_common, "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_common_kmer_abundances.csv")

AG_repeats <- c("AG", "AAG", "AAAG", "AGG", "AAGGAG", "AAGG", "AAAGG", "AAGAG", "AAGGG", "AGAGG", "AAAAAG", "AAAGAG", "AAAGGG", "AAAAAG", "AGAGGG", "AAGAGG", "AAAGGAAG", "AGGG")
data_sorted_common[,AG_repeats]
##                          AG      AAG      AAAG       AGG   AAGGAG     AAGG
## BL6/NTac           67256.45 26347.52  7296.452  8533.097 2418.486 2198.468
## BL6/NCrl           69750.00 38055.28 10579.327  8329.694 2560.842 2322.308
## BL6/NHsd           48925.53 16230.10  5543.591  7846.022 1587.977 1526.284
## BL6N-TyrCBrdCrlCrl 70561.23 38934.90 10379.190  9185.140 2827.708 2397.239
## BL6/JBomTac        66683.30 32800.02  9477.913  8461.805 2386.488 2309.149
## BL6/J              60032.36 27832.02  7200.989  9617.130 2744.191 2227.678
## BL10/ScSnJ         64246.86 40959.64  8851.526 10087.380 2986.484 2297.634
## BL10/SnJ           62270.60 42314.92 10284.460  9944.040 2981.819 2302.539
## BL10/ScCr          50443.03 17614.41  5509.765 10019.725 2079.948 1839.200
## BL6/NJ             63688.51 39818.80  9769.525  9866.128 2859.624 2327.099
## BL10/J             66349.53 41738.66 10232.616  8473.754 2656.981 2079.099
## BL6/ByJ            65096.19 36217.82 10223.577  8199.549 2427.937 2149.029
## BL6/JEiJ           62512.38 33133.06  9364.256  8260.159 2339.655 2148.763
## BL10/ScNHsd        63805.98 38974.22 10193.485  7739.205 2386.076 2139.598
##                        AAAGG     AAGAG     AAGGG    AGAGG   AAAAAG
## BL6/NTac           1736.9611 1042.5865  999.5002 535.6924 245.2341
## BL6/NCrl           1881.9835 1211.2068  960.0620 565.0340 219.2628
## BL6/NHsd            875.7659  624.7763  835.4631 449.1689 244.1678
## BL6N-TyrCBrdCrlCrl 1950.5955 1203.5164 1104.6945 629.0574 237.9968
## BL6/JBomTac        1835.7438 1230.6188  944.1000 588.7770 226.2638
## BL6/J              1656.7825 1041.2083  974.7886 573.7315 247.8997
## BL10/ScSnJ         1869.7141 1199.3150 1018.3826 583.8824 229.5712
## BL10/SnJ           1925.2236 1227.2337  983.7501 579.4867 246.9304
## BL10/ScCr           992.8838  685.8580 1007.0211 540.3200 268.6505
## BL6/NJ             1897.0839 1204.7733 1020.5747 644.0331 228.0036
## BL10/J             1762.1477 1211.4311  799.3172 492.8769 271.2413
## BL6/ByJ            1781.3128 1106.8807  881.1411 512.5859 248.5310
## BL6/JEiJ           1737.5669 1089.5450  843.0328 528.1448 258.9078
## BL10/ScNHsd        1792.5522 1162.6397  830.4409 536.9965 216.8835
##                      AAAGAG   AAAGGG AAAAAG.1   AGAGGG    AAGAGG  AAAGGAAG
## BL6/NTac           263.2982 251.1064 245.2341 223.9919 139.93000 122.36293
## BL6/NCrl           360.0023 253.2182 219.2628 198.2796 154.23436 152.96323
## BL6/NHsd           157.4740 191.8332 244.1678 184.2827  59.50754  60.94818
## BL6N-TyrCBrdCrlCrl 381.7134 297.4409 237.9968 235.0870 170.15005 139.63608
## BL6/JBomTac        359.0054 247.6448 226.2638 221.1700 132.16278 142.68280
## BL6/J              278.0870 294.7840 247.8997 187.6473 128.91459 114.64040
## BL10/ScSnJ         369.5459 297.4225 229.5712 191.3634 136.97390 135.56400
## BL10/SnJ           417.5889 312.8885 246.9304 175.6377 138.50373 138.72122
## BL10/ScCr          164.5873 247.0828 268.6505 206.0989  71.88263  60.08225
## BL6/NJ             417.2335 305.4141 228.0036 210.1792 151.32141 136.65068
## BL10/J             399.0728 264.1721 271.2413 208.6264 127.90973 144.12123
## BL6/ByJ            354.3842 240.8161 248.5310 195.0101 132.82618 143.98770
## BL6/JEiJ           320.7037 235.4233 258.9078 201.1888 118.99387 129.50019
## BL10/ScNHsd        338.5952 212.5425 216.8835 216.0247 124.51339 147.25015
##                        AGGG
## BL6/NTac           142.7499
## BL6/NCrl           123.2662
## BL6/NHsd           119.5792
## BL6N-TyrCBrdCrlCrl 146.3483
## BL6/JBomTac        133.7977
## BL6/J              122.2764
## BL10/ScSnJ         135.3885
## BL10/SnJ           125.6165
## BL10/ScCr          151.9664
## BL6/NJ             133.4303
## BL10/J             117.8604
## BL6/ByJ            110.3731
## BL6/JEiJ           132.9163
## BL10/ScNHsd        117.4796
sum(kmer_means[AG_repeats])
## [1] 125050.6
AC_repeats <- c("AC", "ACC", "AACC", "AAAAAC", "AAAACC", "AAAAACC", "AAACC")
sum(kmer_means[AC_repeats])
## [1] 192392.3
which(colnames(data_sorted_common)=="AGGG")
## [1] 51
# now make a PCA using these 63 kmers

pca.data <- prcomp(data_sorted_common, scale.=T)
library(ggbiplot)
## Loading required package: ggplot2
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2021a.
## 1.0/zoneinfo/America/New_York'
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, labels=rownames(data_sorted_common))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)  

g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8,choices=c(1,3), labels=rownames(data_sorted_common))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)

# do a PCA using more data - see if there is separation. 

get_legit_kmer_indexes <- function (kmer_mx) {
  indexes <- c()
  for (i in 1:ncol(kmer_mx)) { ## go through each column of the data
    p <- kmer_mx[,i] >= 10 ## at least 10 copies
    if (sum(p) >= 1) { # if one sample have at least 2 copies
      indexes <- c(indexes, i)
    }
  }
  return(indexes)
}
legit_indexes <- get_legit_kmer_indexes(data_sorted)
length(legit_indexes)
## [1] 434
# 434 kmers have at least 10 copies in at least one sample - use these now for the PCA. 
legit_data <- data_sorted[,legit_indexes]

legit_kmer_names <- colnames(legit_data)

# is the kmer at high density present? 
#legit_data[,"ACACACAGTGGAGCACTGTG"] # no, not present. Too high diversity
#data_sorted[,"ACACACAGTGGAGCACTGTG"]

legit_data[,"AAGCCAGCTGTGGGTAAGG"] # only low abundance. 17-25 copies
##  [1] 18.01512 18.78037 18.43254 19.54553 19.79372 20.68253 23.65253
##  [8] 23.73084 25.80129 25.45674 19.98419 19.78335 21.77381 17.85810
legit_data[,"AAAAATCTTAAAGG"] 
##  [1] 107.13230 110.76308 167.61558 102.14928 156.76916 120.55382  56.75502
##  [8] 109.61623 233.95632  81.24444  49.65902  38.63920  67.85405 303.27720
mean(legit_data[,"AAAAATCTTAAAGG"]) # highly variable, in the top 63. (range 233-38) missing from the assembly
## [1] 121.856
#write.csv(legit_data, "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/legit_data.csv")

#write.table(kb5_kmers_notkseek, file = "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/kb5_kmers_notkseek", row.names = F, col.names=F, quote=F)

#write.table(kb5_kmers_notkseek, file = "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/kb5_kmers_yeskseek", row.names = F, col.names=F, quote=F)

#write.table(legit_kmer_names, file = "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/legit_kmer_names.txt", row.names = F, col.names=F, quote=F)

# change the name
# colour them differently

rownames(legit_data)[4] <- "BL6/NTyrCBrdCrlCrl" 

strain <- sapply(strsplit(as.vector(rownames(legit_data)), "\\/"), '[', 1)


pca.data <- prcomp(legit_data, scale.=T)

g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, labels=rownames(legit_data), labels.size=2)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)

# here's some separation
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = strain, var.axes = F, alpha=0.8, choices=c(1,3))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)

g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, choices=c(1,3), labels=rownames(legit_data), labels.size=2)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)

Get the amount of satellites in base pairs in each line

get_abskmer_mx <- function (kmer_mx, indexes) {
  kmers.abs <- matrix(nrow=nrow(kmer_mx), ncol=max(indexes))
  for (i in indexes) {
    for (j in 1:nrow(kmer_mx)) {
    kmers.abs[j,i] <- kmer_mx[j,i] * (nchar(colnames(kmer_mx)[i], type="chars")) 
    }
  }
  kmers.abs <- kmers.abs[,colSums(is.na(kmers.abs)) != nrow(kmers.abs)]# get rid of columns that are all NAs
  rownames(kmers.abs) <- rownames(kmer_mx)
  colnames(kmers.abs) <- colnames(kmer_mx)
  return(kmers.abs)
}

# get total kmer content per MA line
get_total_kmercontent <- function (abs_mx) {
  total.kmers.abs <- c()
  for (i in 1:nrow(abs_mx)){
    total.kmers.abs[i] <- sum(abs_mx[i,])/10^6
  }
  names(total.kmers.abs) <- rownames(abs_mx)
  return (total.kmers.abs)
}

data_abs <- get_abskmer_mx(legit_data, c(1:ncol(legit_data)))
total_kmer_content <- get_total_kmercontent(data_abs)
total_kmer_content # note, this is in MB
##           BL6/NTac           BL6/NCrl           BL6/NHsd 
##           1.222917           1.288536           1.073765 
## BL6/NTyrCBrdCrlCrl        BL6/JBomTac              BL6/J 
##           1.393987           1.270839           1.388050 
##         BL10/ScSnJ           BL10/SnJ          BL10/ScCr 
##           1.472052           1.469211           1.179894 
##             BL6/NJ             BL10/J            BL6/ByJ 
##           1.469178           1.274572           1.285923 
##           BL6/JEiJ        BL10/ScNHsd 
##           1.215779           1.250947
boxplot(total_kmer_content, las=2)

Bring in the tree and visualize how kmer abundance evolves along the tree

library(phytools)
## Loading required package: ape
## Loading required package: maps
## 
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
## 
##     ozone
mouse_tree <- read.tree("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/manual_ext.tre")
plot(mouse_tree)

# need to rename so they match the tree

new_rownames <- gsub("/", "", names(total_kmer_content))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2

names(total_kmer_content) <- new_rownames2
#total_kmer_content

obj<-contMap(mouse_tree,total_kmer_content,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

Get kmer lengths and GC content

Calc_kmer_length <- function (kmer_string) {
  return (nchar(kmer_string))
}
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.3.3
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plyr':
## 
##     rename
## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.3.3
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:plyr':
## 
##     desc
## Loading required package: XVector
## Warning: package 'XVector' was built under R version 3.3.3
## 
## Attaching package: 'XVector'
## The following object is masked from 'package:plyr':
## 
##     compact
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
Calc_GC_content <- function (kmer_string) {
  string <- BString(kmer_string)
  gc.content <- c(letterFrequency(string, letters=c("CG"), as.prob=F)/(length(string)))
  return(gc.content)
}
common.kmers <- colnames(data_sorted_common)

common.kmer.lengths <- sapply(common.kmers, Calc_kmer_length)
names(common.kmer.lengths) <- common.kmers
#common.kmer.lengths[1:10]

common.kmer.gc <- sapply(common.kmers, Calc_GC_content)
names(common.kmer.gc) <- common.kmers
#common.kmer.gc[1:10]

hist(common.kmer.lengths, breaks=20)

summary(common.kmer.lengths) # median = 6
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   4.000   6.000   6.524   6.000  19.000
length(which(common.kmer.lengths==4)) # 11 
## [1] 11
length(which(common.kmer.lengths==5)) #8 
## [1] 8
length(which(common.kmer.lengths==6)) # 22
## [1] 22
length(common.kmer.lengths)
## [1] 63
hist(common.kmer.gc, breaks=10)

summary(common.kmer.gc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3333  0.5000  0.4456  0.5895  0.7500
# look at legit kmers now
legit.kmers <- colnames(legit_data)
legit.kmer.lengths <- sapply(legit.kmers, Calc_kmer_length)
names(legit.kmer.lengths) <- legit.kmers

legit.kmer.gc <- sapply(legit.kmers, Calc_GC_content)
names(legit.kmer.gc) <- legit.kmers


hist(legit.kmer.lengths, breaks=20)

hist(legit.kmer.gc, breaks=10)

# note: telomere satellite is common.kmers[4]:
common.kmers[4]
## [1] "AACCCT"
total_kmer_content
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         1.222917         1.288536         1.073765         1.393987 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         1.270839         1.388050         1.472052         1.469211 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         1.179894         1.469178         1.274572         1.285923 
##           B6JEiJ        B10ScNHsd 
##         1.215779         1.250947

Boxplot of abundant kmers

abun_data <- c()
for (i in 1:ncol(data_sorted_common)) {
  abun_data <- c(abun_data, data_sorted_common[,i])
}

kmer_names <- c()
for (i in 1:ncol(data_sorted_common)) {
  kmer_names <- c(kmer_names, c(rep(colnames(data_sorted_common)[i], times = nrow(data_sorted_common))))
}

rownames(data_sorted_common)[4] <- "BL6/NTyrCBrdCrlCrl" 
abun_df <- data.frame(rownames(data_sorted_common), kmer_names, abun_data)
abun_df[1:15, ]
##    rownames.data_sorted_common. kmer_names abun_data
## 1                      BL6/NTac         AC 185511.68
## 2                      BL6/NCrl         AC 182274.28
## 3                      BL6/NHsd         AC 182867.67
## 4            BL6/NTyrCBrdCrlCrl         AC 192672.86
## 5                   BL6/JBomTac         AC 188531.48
## 6                         BL6/J         AC 202091.49
## 7                    BL10/ScSnJ         AC 202385.62
## 8                      BL10/SnJ         AC 199890.74
## 9                     BL10/ScCr         AC 204310.46
## 10                       BL6/NJ         AC 197374.94
## 11                       BL10/J         AC 175373.72
## 12                      BL6/ByJ         AC 178864.78
## 13                     BL6/JEiJ         AC 180017.66
## 14                  BL10/ScNHsd         AC 182467.11
## 15                     BL6/NTac         AG  67256.45
colnames(abun_df) <- c("Sample", "kmer", "Copy")
strain <- sapply(strsplit(as.vector(abun_df$Sample), "\\/"), '[', 1)

abun_df <- data.frame(abun_df, strain)
#abun_df


library(ggplot2)
abun_df$kmer <- factor(abun_df$kmer, levels=colnames(data_sorted_common))
# this plot needs some work
ggplot(data = abun_df[c(1:70),], aes(x = kmer, y = Copy)) + 
  geom_boxplot() +
  geom_jitter(aes(color=strain)) +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1)) 

ggplot(data = abun_df[c(71:140),], aes(x = kmer, y = Copy)) + 
  geom_boxplot() +
  geom_jitter(aes(color=strain)) +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))

Now, calculate mutation rates: Set it up with functions

# note, this part is needed to make the table I import in:
mouse_tree$tip.label
##  [1] "B10SnJ"           "B10ScSnJ"         "B10ScCr"         
##  [4] "B10ScNHsd"        "B10J"             "B6ByJ"           
##  [7] "B6NCrl"           "B6NHsd"           "B6NTac"          
## [10] "B6NTyrCBrdCrlCrl" "B6NJ"             "B6JEiJ"          
## [13] "B6JBomTac"        "B6J"
mouse_tree$Nnode # 13 nodes now
## [1] 13
findMRCA(mouse_tree, tips=mouse_tree$tip.label, type="node")
## [1] 15
findMRCA(mouse_tree, tips=c("B6J", "B6ByJ"), type="node")
## [1] 20
findMRCA(mouse_tree, tips=c("B6J", "B6JEiJ"), type="node")
## [1] 26
findMRCA(mouse_tree, tips=c("B6J", "B6JBomTac"), type="node")
## [1] 27
findMRCA(mouse_tree, tips=c("B6NJ", "B6ByJ"), type="node")
## [1] 21
findMRCA(mouse_tree, tips=c("B6NJ", "B6NCrl"), type="node")
## [1] 22
findMRCA(mouse_tree, tips=c("B6NJ", "B6NHsd"), type="node")
## [1] 23
findMRCA(mouse_tree, tips=c("B6NJ", "B6NTac"), type="node")
## [1] 24
findMRCA(mouse_tree, tips=c("B6NJ", "B6NTyrCBrdCrlCrl"), type="node")
## [1] 25
findMRCA(mouse_tree, tips=c("B10J", "B10SnJ"), type="node")
## [1] 16
findMRCA(mouse_tree, tips=c("B10J", "B10ScSnJ"), type="node")
## [1] 17
findMRCA(mouse_tree, tips=c("B10J", "B10ScCr"), type="node")
## [1] 18
findMRCA(mouse_tree, tips=c("B10J", "B10ScNHsd"), type="node")
## [1] 19
# here is the node info file:
node_info <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/node_info_Mar17-20.csv", header = T)

# here is a function to calculate mutation rates for our tree, given an abundance matrix

calc_mutation_rates <- function (df_abun) {
  mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
  for (i in 1:ncol(df_abun)) {
    kmer_subset <- df_abun[,i]
    names(kmer_subset) <- rownames(df_abun)
    anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T)
    for (j in 1:nrow(df_abun)) {
      subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
      mu_mx[j,i] <- (df_abun[j,i] - anc_states$ace[subset_table$nearest_node]) / subset_table$gens
    }
  }
  rownames(mu_mx) <- rownames(df_abun)
  colnames(mu_mx) <- colnames(df_abun)
  return(mu_mx)
}

# check for total kmer content
anc_states <- fastAnc(mouse_tree, total_kmer_content, vars=T, CI=T)
#anc_states
1.073765 - 1.264150 # lost by B6NHsd since common ancestor with below
## [1] -0.190385
1.469178 - 1.264150 # gained by B6NJ since common ancestor with below
## [1] 0.205028
# here is a function that calculates normalized-by-copy-number mutation rates, given the original abundance matrix and the non-normalized mutation matrix

calc_norm_mu <- function (df_abun, mu_mx) {
  mu_mx_norm <- matrix(nrow=nrow(mu_mx), ncol=ncol(mu_mx))
  for (i in 1:ncol(mu_mx)) {
    kmer_subset <- df_abun[,i]
    names(kmer_subset) <- rownames(df_abun)
    anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T) # get the ancestral copy number
    for (j in 1:nrow(mu_mx)) {
      subset_table <- node_info[which(node_info$Sample==rownames(mu_mx)[j]),]
      mu_mx_norm[j,i] <- mu_mx[j,i]/anc_states$ace[subset_table$nearest_node]
    }
  }
  rownames(mu_mx_norm) <- rownames(mu_mx)
  colnames(mu_mx_norm) <- colnames(mu_mx)
  return(mu_mx_norm)
}

# this is a function that calculates absolute mutation rates (taking the absolute value of additions and subtractions).

calc_mutation_rates_abs <- function (df_abun) {
  mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
  for (i in 1:ncol(df_abun)) {
    kmer_subset <- df_abun[,i]
    names(kmer_subset) <- rownames(df_abun)
    anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T)
    for (j in 1:nrow(df_abun)) {
      subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
      mu_mx[j,i] <- (abs(df_abun[j,i] - anc_states$ace[subset_table$nearest_node])) / subset_table$gens
    }
  }
  rownames(mu_mx) <- rownames(df_abun)
  colnames(mu_mx) <- colnames(df_abun)
  return(mu_mx)
}

Now, do the mutation rate calculations for the common kmers

# fix the rownames so they match
new_rownames <- gsub("/", "", rownames(data_sorted_common))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2

rownames(data_sorted_common) <- new_rownames2

mu_common <- calc_mutation_rates(data_sorted_common)
per_kmer_mu<- colMeans(mu_common)
length(per_kmer_mu)
## [1] 63
kmer_means_sorted <- sort (kmer_means, decreasing = T)

# check for correlation of abundance with copy number.
plot(x = kmer_means_sorted[1:63], y = per_kmer_mu )

plot (x = kmer_means_sorted[7:63], y = per_kmer_mu[7:63])

# generally a positive correlation

# for each line, do a sign test to determine if kmers have randomly and independently gone up and down - would expect half would be net increase and half would be net decrease
binom_results <- c()
for (i in 1:nrow(mu_common)) {
  num_neg <- length(which(mu_common[i, ] < 0))
  total <- ncol(mu_common)
  p <- binom.test(num_neg, total)
  binom_results <- c(binom_results, p$p.value)
}
binom_results
##  [1] 8.980472e-04 1.000000e+00 5.152397e-03 4.295655e-02 3.135031e-01
##  [6] 5.152397e-03 2.227532e-03 3.760612e-05 8.013065e-01 3.367093e-04
## [11] 1.000000e+00 1.000000e+00 8.980472e-04 4.295655e-02
rownames(mu_common)[which(binom_results < 0.01)]
## [1] "B6NTac"   "B6NHsd"   "B6J"      "B10ScSnJ" "B10SnJ"   "B6NJ"    
## [7] "B6JEiJ"
# many lines have more increases than decreases or vise versa  

Plot the relationship between kmer abundance and copy number change rate in each line

### now do for each line - plot the common kmers abundance and mutation rate

#install.packages("ggallin")
library("ggallin")

rownames(data_sorted_common[1,])
## [1] "B6NTac"
a <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[1,]), y = as.numeric(mu_common[1,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NTac") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[2,])
## [1] "B6NCrl"
b <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[2,]), y = as.numeric(mu_common[2,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NCrl") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[3,])
## [1] "B6NHsd"
c <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[3,]), y = as.numeric(mu_common[3,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NHsd") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[4,])
## [1] "B6NTyrCBrdCrlCrl"
d <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[4,]), y = as.numeric(mu_common[4,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NTyrCBrdCrlCrl") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[5,])
## [1] "B6JBomTac"
e <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[5,]), y = as.numeric(mu_common[5,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6JBomTac") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")


rownames(data_sorted_common[6,])
## [1] "B6J"
f <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[6,]), y = as.numeric(mu_common[6,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6J") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[7,])
## [1] "B10ScSnJ"
g <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[7,]), y = as.numeric(mu_common[7,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10ScSnJ") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[8,])
## [1] "B10SnJ"
h <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[8,]), y = as.numeric(mu_common[8,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10SnJ") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[9,])
## [1] "B10ScCr"
i <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[9,]), y = as.numeric(mu_common[9,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10ScCr") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[10,])
## [1] "B6NJ"
j <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[10,]), y = as.numeric(mu_common[10,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NJ") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[11,])
## [1] "B10J"
k <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[11,]), y = as.numeric(mu_common[11,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10J") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[12,])
## [1] "B6ByJ"
l <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[12,]), y = as.numeric(mu_common[12,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6ByJ") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[13,])
## [1] "B6JEiJ"
m <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[13,]), y = as.numeric(mu_common[13,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6JEiJ") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

rownames(data_sorted_common[14,])
## [1] "B10ScNHsd"
n <- ggplot () +
  geom_point(aes (x = as.numeric(data_sorted_common[14,]), y = as.numeric(mu_common[14,]), alpha = 0.5)) +
  labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10ScNHsd") +
  scale_y_continuous(trans = pseudolog10_trans) +
  scale_x_log10() +
  theme(legend.position = "none")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
grid.arrange(b,g,j,c,a,d,e,f,h,i,k,l,m,n, ncol=2)

Next, normalize the kmers by copy number, also calculate absolute mutation rates

#normalize by copy number

mu_common_norm <- calc_norm_mu(data_sorted_common, mu_common)


# per kmer average mutation rate

per_kmer_mu_norm_mean <- colMeans(mu_common_norm)
par(mar = c(2, 2, 2, 2))
plot(per_kmer_mu_norm_mean)

# one kmer has the most negative mutatation rate
which(per_kmer_mu_norm_mean==min(per_kmer_mu_norm_mean)) # normalized by copy number, most contracting
## AAAGACAGACAG 
##           27
# this is the one with phylogenetic signal
# verified this one manually to make sure there was no mistake.  

highest_kmer <- data_sorted_common[,27]
names(highest_kmer) <- rownames(data_sorted_common)

# look at amounts at each node
anc_states <- fastAnc(mouse_tree, highest_kmer, vars=T, CI=T)
anc_states$ace
##        15        16        17        18        19        20        21 
## 420.27294  72.76111  57.51539  43.91212  21.99731 767.78476 788.11806 
##        22        23        24        25        26        27 
## 805.51032 851.40148 864.22009 872.23266 824.66497 846.29339
obj<-contMap(mouse_tree,highest_kmer,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

par(mar = c(2, 2, 2, 2))
per_line_mu_norm_mean <- rowMeans(mu_common_norm)
plot(per_line_mu_norm_mean) # line 10 is most increasing, line 3 is most decreasing

per_line_mu_norm_mean
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##    -5.108446e-04     1.018382e-04    -9.047817e-04     2.279445e-04 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##    -2.278171e-04     4.288457e-04     1.220611e-04     1.862966e-04 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##    -3.973537e-04     8.150201e-04     2.706781e-05    -3.471784e-05 
##           B6JEiJ        B10ScNHsd 
##    -1.783642e-04    -3.200731e-04
# look at the telomere satellite

telomere_kmer <- data_sorted_common[,4]
names(telomere_kmer) <- rownames(data_sorted_common)

telomere_kmer
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         30146.60         24934.31         26172.35         31314.12 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         25013.94         45939.98         46029.90         40211.11 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         35171.94         39748.29         22781.08         30818.52 
##           B6JEiJ        B10ScNHsd 
##         24649.66         26089.82
# look at amounts at each node
anc_states <- fastAnc(mouse_tree, telomere_kmer, vars=T, CI=T)
anc_states$ace
##       15       16       17       18       19       20       21       22 
## 33122.94 35614.98 35362.01 33648.22 28314.51 30630.91 30000.19 29604.91 
##       23       24       25       26       27 
## 30121.95 31568.94 33730.17 30980.68 33096.57
obj<-contMap(mouse_tree,telomere_kmer,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# range 22781 - 46030 - factor of 2.

# look at CAG repeats (polyQ)

which(names((data_sorted_common))=="AGC")
## [1] 25
poly_Q <- data_sorted_common[,25]
names(poly_Q) <- rownames(data_sorted_common)

anc_states <- fastAnc(mouse_tree, poly_Q, vars=T, CI=T)
anc_states$ace
##       15       16       17       18       19       20       21       22 
## 587.9395 588.3390 589.6813 588.8991 584.2025 587.5399 584.7799 584.6911 
##       23       24       25       26       27 
## 585.0344 585.1220 585.3344 591.7505 587.1016
obj<-contMap(mouse_tree,poly_Q,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

poly_Q # very tight: 572-613 copies
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         584.8186         582.6070         585.9047         588.1745 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         572.0243         591.7187         609.6128         573.4039 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         595.1478         582.8482         577.5038         577.1086 
##           B6JEiJ        B10ScNHsd 
##         613.4282         584.0698
## Let's make a box plot of mutation rates ###

head(mu_common_norm)
##                             AC            AG           AAG        AACCCT
## B6NTac           -1.665755e-04  0.0004075430 -0.0016936235 -0.0004374261
## B6NCrl           -1.037055e-04  0.0005307961  0.0010510781 -0.0008913244
## B6NHsd           -1.520792e-04 -0.0015642071 -0.0033459144 -0.0009501471
## B6NTyrCBrdCrlCrl  4.070876e-05  0.0010917158  0.0012808578 -0.0011192028
## B6JBomTac        -1.399663e-04  0.0004652882  0.0004670982 -0.0021052905
## B6J               4.700042e-04 -0.0004409409 -0.0009093677  0.0033453313
##                           AGAT        AGAGGC          AAAG           AGG
## B6NTac           -0.0036641401 -3.605975e-04 -0.0015142443 -3.295951e-04
## B6NCrl            0.0015920712 -3.000287e-04  0.0010430853 -1.410237e-04
## B6NHsd           -0.0043442036  2.057965e-06 -0.0024647120 -6.330540e-04
## B6NTyrCBrdCrlCrl  0.0009625905  2.326053e-04  0.0015827283 -4.349942e-05
## B6JBomTac         0.0017003030 -2.519822e-05  0.0008803666 -3.848845e-04
## B6J              -0.0017989410  2.586653e-04 -0.0014021171  7.395840e-04
##                          ACAGC        AAGGAG          AAGG           ACC
## B6NTac           -6.017285e-05 -0.0002962075  7.938251e-05 -0.0006369315
## B6NCrl           -6.596555e-04  0.0003701609  4.853483e-04 -0.0001221603
## B6NHsd            3.251943e-04 -0.0023512420 -1.946080e-03 -0.0002215841
## B6NTyrCBrdCrlCrl  1.249040e-03  0.0008304876  8.059319e-04 -0.0005678451
## B6JBomTac        -2.921477e-04 -0.0004308801  2.956358e-04 -0.0005847944
## B6J               8.914476e-04  0.0007966639 -1.894608e-05  0.0008934752
##                           ATCC         AAAGG          ACAG
## B6NTac           -4.760371e-04  0.0001649562 -9.431094e-05
## B6NCrl            2.326906e-05  0.0007292390 -8.334183e-05
## B6NHsd           -9.738023e-04 -0.0032640963 -1.418911e-03
## B6NTyrCBrdCrlCrl -2.380564e-04  0.0010688596  2.446654e-03
## B6JBomTac        -2.671971e-04  0.0004943666  3.432759e-04
## B6J               5.718671e-04 -0.0003942336  6.148712e-05
##                  AAGTCTGCACACACATACT          ACCT        ACACAG
## B6NTac                 -0.0003693063 -0.0005101438 -0.0002956678
## B6NCrl                 -0.0007367637  0.0006628411 -0.0001270817
## B6NHsd                  0.0003864204 -0.0042674535 -0.0001753629
## B6NTyrCBrdCrlCrl       -0.0006380546  0.0026445685  0.0004140957
## B6JBomTac              -0.0012574995  0.0006555566 -0.0003819965
## B6J                     0.0017008338 -0.0005348536  0.0006864821
##                            AT         AAGAG         AAGGG      AGAGAGGC
## B6NTac           -0.002822038 -0.0002509086  8.403835e-05 -4.090651e-04
## B6NCrl            0.001638562  0.0007967816  9.550605e-05 -9.735448e-05
## B6NHsd           -0.004908572 -0.0027839155 -8.956826e-04 -9.252141e-05
## B6NTyrCBrdCrlCrl -0.001023197  0.0008232176  1.132484e-03  8.045671e-04
## B6JBomTac         0.001204549  0.0008527776  5.331611e-05 -2.558212e-05
## B6J              -0.000407570 -0.0006053300  3.352709e-04  1.654990e-04
##                         ACATAT ACAGACAGCACAGCACAGC           AGC
## B6NTac           -8.973783e-04       -0.0001011831 -5.035019e-06
## B6NCrl            3.277600e-04       -0.0006282529 -2.013794e-05
## B6NHsd           -8.395672e-04        0.0005911077  1.077951e-05
## B6NTyrCBrdCrlCrl  6.342445e-04        0.0012820218  7.581416e-05
## B6JBomTac        -7.849088e-05       -0.0004538098 -2.213871e-04
## B6J               2.805531e-04        0.0008161647  6.779565e-05
##                          AGAGG  AAAGACAGACAG AAAAACAAGGGAGATAT
## B6NTac           -5.451441e-04  0.0001919794     -0.0006024016
## B6NCrl            1.976184e-04 -0.0007518187     -0.0007702676
## B6NHsd           -1.271293e-03  0.0009523442      0.0027391458
## B6NTyrCBrdCrlCrl  6.162522e-04 -0.0005997973     -0.0013208665
## B6JBomTac         3.186189e-04 -0.0010862144     -0.0010467956
## B6J               9.018506e-05  0.0015819253      0.0009937361
##                           AGGC ACAGACAGCACAGC          AACC         AAAAG
## B6NTac           -0.0020117145   4.751776e-05 -6.836915e-04  0.0001320768
## B6NCrl           -0.0016632974  -7.066192e-04  2.378089e-04  0.0001860139
## B6NHsd            0.0004286222   6.277708e-04  7.886844e-05  0.0001341426
## B6NTyrCBrdCrlCrl -0.0024026780   1.163767e-03  3.846697e-04  0.0003208179
## B6JBomTac        -0.0044165306  -4.766036e-04  2.781311e-04  0.0003026365
## B6J               0.0048059615   9.447221e-04 -3.292601e-04 -0.0004063741
##                         AAAGAG        AGATAT        AGCAGG        ACAGAT
## B6NTac           -0.0016948721 -2.063519e-03 -8.270165e-04 -5.103155e-04
## B6NCrl            0.0008036940  1.502680e-03 -4.141238e-04  2.764000e-04
## B6NHsd           -0.0034238043 -1.268949e-03  5.412703e-05 -2.417208e-03
## B6NTyrCBrdCrlCrl  0.0008106058 -2.490464e-04 -5.730695e-04  9.071935e-05
## B6JBomTac         0.0010697903 -2.440358e-04 -9.435134e-04  2.287869e-04
## B6J              -0.0011144041  6.836742e-05  1.548868e-03  1.104506e-04
##                         AAAGGG        AACAAG        AAAAAG       AGGAGGC
## B6NTac           -4.677551e-04  2.546155e-05  0.0002801716 -9.187078e-04
## B6NCrl            6.409462e-05  8.371351e-04 -0.0004327796 -5.548012e-04
## B6NHsd           -1.681945e-03 -2.853162e-03  0.0001672457  1.284541e-05
## B6NTyrCBrdCrlCrl  7.203940e-04  2.164467e-04  0.0001700484 -1.205617e-03
## B6JBomTac        -5.027060e-04 -6.728850e-04 -0.0005210630 -5.620871e-04
## B6J               1.042552e-03  8.818891e-07  0.0002534426  1.029203e-03
##                           AAGC        AGAGGG AGAGAGGCAGAGGC         AAAGC
## B6NTac           -5.434884e-04  0.0004652802  -3.312943e-04  3.483304e-04
## B6NCrl            7.403019e-04 -0.0001436764   8.352441e-05  7.155242e-05
## B6NHsd           -3.233108e-04 -0.0007647574  -1.441311e-04  8.826565e-05
## B6NTyrCBrdCrlCrl  2.236890e-04  0.0011788727   4.182530e-04  5.895693e-04
## B6JBomTac        -6.453794e-05  0.0007462053   3.510603e-05  7.007548e-04
## B6J               2.297196e-04 -0.0006735322   2.077488e-04 -6.941516e-04
##                         ACAGGC AAACAAGGGAGATAT        ACAGAG          ACAT
## B6NTac           -0.0005248930   -0.0008268581  0.0008632881 -0.0010080044
## B6NCrl           -0.0008088567   -0.0003203606 -0.0001639874 -0.0001848299
## B6NHsd           -0.0002515918    0.0009086674 -0.0005174375 -0.0003440389
## B6NTyrCBrdCrlCrl -0.0003289970   -0.0006655922 -0.0005338290  0.0014361438
## B6JBomTac        -0.0005080720   -0.0006975355 -0.0002898178 -0.0005412730
## B6J               0.0012970047    0.0008500562  0.0006739503  0.0003718677
##                            ACT        AGAGAT          AGGG        AAAAAC
## B6NTac           -3.081124e-04 -0.0029479031  0.0005669797  0.0002932379
## B6NCrl           -7.990339e-05  0.0013842437 -0.0001663411 -0.0012957452
## B6NHsd           -3.130408e-03 -0.0038984548 -0.0005714235  0.0046463337
## B6NTyrCBrdCrlCrl -8.992007e-04 -0.0007953841  0.0009922715 -0.0014301993
## B6JBomTac         3.735892e-04  0.0001349109  0.0003589676 -0.0077436549
## B6J               2.365849e-04 -0.0005866912 -0.0004142656  0.0056165427
##                         AAGAGG      AAAGGAAG AAAAATCTTAAAGG        ACTGCT
## B6NTac            0.0001734283 -0.0001211827   6.059315e-05 -0.0005373543
## B6NCrl            0.0010380652  0.0011953256   2.555483e-04 -0.0008290947
## B6NHsd           -0.0038070492 -0.0034883860   3.463674e-03 -0.0004780371
## B6NTyrCBrdCrlCrl  0.0020798018  0.0009448833   5.935478e-04 -0.0011862780
## B6JBomTac         0.0002366203  0.0009352782   2.402745e-03 -0.0007848794
## B6J               0.0000189325 -0.0009428198  -1.437855e-04  0.0004148049
##                         ACACAT         AAACC        AAAACC       AAAAACC
## B6NTac           -0.0003328363  4.595988e-05 -0.0008159407  0.0002409840
## B6NCrl            0.0003042913  1.862295e-04 -0.0006498141 -0.0003272137
## B6NHsd           -0.0010053734 -3.836788e-04  0.0013755596  0.0021085351
## B6NTyrCBrdCrlCrl -0.0017003803 -3.084245e-04 -0.0009651283 -0.0009297406
## B6JBomTac        -0.0006450958  2.966610e-04 -0.0007742960 -0.0015244409
## B6J               0.0012565483 -2.042011e-04  0.0012831101  0.0017776114
##                       ACACATAT  AGCAGGAGGAGG        ACATAG
## B6NTac           -0.0016465201 -4.637637e-06 -9.896685e-04
## B6NCrl            0.0014689497 -5.021070e-04  9.122955e-04
## B6NHsd           -0.0040666019 -2.520583e-05 -2.393602e-03
## B6NTyrCBrdCrlCrl  0.0010072023  3.323291e-04  3.911062e-04
## B6JBomTac         0.0009190992 -3.435794e-04 -6.417093e-05
## B6J              -0.0009905018  4.401858e-04  8.034789e-04
# need to get it in format to make a box plot. Want to use mu_common_norm

mutation_rate_data <- c()
kmer_names <- c()

for (i in 1:nrow(mu_common_norm)) {
  mutation_rate_data <- c(mutation_rate_data, as.numeric(mu_common_norm[i,]) )
  kmer_names <- c(kmer_names, colnames(mu_common_norm))
}

mutation_rate_df <- data.frame(mutation_rate_data, kmer_names)
head(mutation_rate_df)
##   mutation_rate_data kmer_names
## 1      -0.0001665755         AC
## 2       0.0004075430         AG
## 3      -0.0016936235        AAG
## 4      -0.0004374261     AACCCT
## 5      -0.0036641401       AGAT
## 6      -0.0003605975     AGAGGC
# a plot for the hypermutators - part I (later add in complex)
# add in all the 14 samples
subset_hypermutators <- mu_common_norm[which(rownames(mu_common_norm)=="B6NHsd" | rownames(mu_common_norm)=="B6NJ"),]

# make this into a dataframe that can be a boxplot.
abun_hypermut <- c()
for (j in 1:ncol(mu_common_norm)) {
  for (i in 1:nrow(mu_common_norm)) {
    abun_hypermut <- c(abun_hypermut, mu_common_norm[i,j])
  }
}

line_names <- c(rep(rownames(mu_common_norm), times = ncol(mu_common_norm) ))

kmer_names <- c()
for (i in 1:ncol(mu_common_norm)) {
  kmer_names <- c(kmer_names, c(rep(colnames(mu_common_norm)[i], times = nrow(mu_common_norm))))
}

#now make a dataframe

hypermutator_df <- data.frame(line_names, kmer_names, abun_hypermut)
#abun_df

ggplot(data = hypermutator_df, aes(x = line_names, y = abun_hypermut)) + 
  geom_boxplot() +
  geom_jitter(aes(color=line_names)) +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1)) 

# now absolute mutation rates  

mu_common_abs <- calc_mutation_rates_abs(data_sorted_common)
mu_common_abs_norm <- calc_norm_mu(data_sorted_common, mu_common_abs)

per_kmer_mu_abs_norm_mean <- colMeans(mu_common_abs_norm)
sort(per_kmer_mu_abs_norm_mean, decreasing=T)[1:10]
## AAAAATCTTAAAGG         AAAAAC   AAAGACAGACAG           AGAT           AGGC 
##    0.002199034    0.002024336    0.001949065    0.001690533    0.001666607 
##             AT         AGAGAT       ACACATAT         AACCCT         AAAGAG 
##    0.001665801    0.001490404    0.001242575    0.001194939    0.001164543
per_line_mu_abs_norm_mean <- rowMeans(mu_common_abs_norm)

plot(per_kmer_mu_abs_norm_mean) # huh, when we take the absolute value of changes, it doesn't stand out.  Only when we take the directional value

which(per_kmer_mu_abs_norm_mean==max(per_kmer_mu_abs_norm_mean))
## AAAAATCTTAAAGG 
##             55
range(per_kmer_mu_abs_norm_mean)
## [1] 7.796324e-05 2.199034e-03
sort (per_kmer_mu_abs_norm_mean)
##                 AGC      AGAGAGGCAGAGGC                  AC 
##        7.796324e-05        1.682046e-04        2.065710e-04 
##            AGAGAGGC                AACC              AGAGGC 
##        2.146289e-04        2.208613e-04        2.348638e-04 
##               AAAAG        AGCAGGAGGAGG              ACACAG 
##        2.540561e-04        2.579746e-04        2.894117e-04 
##                AAGC               AAACC               AAAGC 
##        3.236761e-04        3.358232e-04        3.458564e-04 
##              ACATAT              AAAAAG               AAGGG 
##        3.556181e-04        3.626719e-04        3.646905e-04 
##                AAGG                 AGG              ACAGAG 
##        3.823177e-04        3.945726e-04        4.020140e-04 
##               AGAGG              AGAGGG                AGGG 
##        4.021618e-04        4.082530e-04        4.225192e-04 
##                ATCC                 ACC                ACAT 
##        4.298685e-04        4.387321e-04        4.458417e-04 
##                  AG     AAACAAGGGAGATAT                ACAG 
##        4.701614e-04        5.491447e-04        5.680881e-04 
##              ACAGGC              AACAAG              AAGGAG 
##        6.252987e-04        6.364125e-04        6.367376e-04 
##              AAAGGG AAGTCTGCACACACATACT              ACAGAT 
##        6.376478e-04        6.483114e-04        6.495259e-04 
##              ACACAT              ACTGCT             AGGAGGC 
##        6.591535e-04        6.928947e-04        6.999353e-04 
##               AAGAG              AGCAGG               AAAGG 
##        7.261712e-04        7.288115e-04        7.304379e-04 
##              ACATAG              AAGAGG              AAAACC 
##        7.437793e-04        7.910306e-04        7.939359e-04 
##   AAAAACAAGGGAGATAT              AGATAT            AAAGGAAG 
##        8.620851e-04        8.624179e-04        9.217655e-04 
##                 ACT                AAAG             AAAAACC 
##        9.223615e-04        9.625643e-04        9.724336e-04 
##               ACAGC                ACCT                 AAG 
##        9.962429e-04        1.057811e-03        1.098304e-03 
##      ACAGACAGCACAGC ACAGACAGCACAGCACAGC              AAAGAG 
##        1.129597e-03        1.139886e-03        1.164543e-03 
##              AACCCT            ACACATAT              AGAGAT 
##        1.194939e-03        1.242575e-03        1.490404e-03 
##                  AT                AGGC                AGAT 
##        1.665801e-03        1.666607e-03        1.690533e-03 
##        AAAGACAGACAG              AAAAAC      AAAAATCTTAAAGG 
##        1.949065e-03        2.024336e-03        2.199034e-03
ordered_by_mu <- names(sort (per_kmer_mu_abs_norm_mean))

# let's sort the kmers in order of increasing mutation rate, then change the order in 

mutation_rate_df$kmer_names <- factor(mutation_rate_df$kmer_names, levels = ordered_by_mu)

# need to mess with the dimensions
par(mar = c(7,5,2,2))
boxplot (mutation_rate_df[,1] ~ mutation_rate_df[,2], data=mutation_rate_df, las=2, cex.axis=0.5, ylab="Copies changed/gen/copy", outline = F)
abline(h=0, col="red", lwd=2)

# AGC actually has the lowest mutation rate (can I give it a Z score?)
mu <- mean(per_kmer_mu_abs_norm_mean)
sigma <- sd(per_kmer_mu_abs_norm_mean)
(AGC_zscore <- abs(7.584906e-05 - mu)/sigma)
## [1] 1.384379
AGAT <- data_sorted_common[,5]
names(AGAT) <- rownames(data_sorted_common)
obj<-contMap(mouse_tree,AGAT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

par(mar = c(2, 2, 2, 2))
plot(per_line_mu_abs_norm_mean) # 3 and 10 are the highest

mean(mu_common_abs_norm)
## [1] 0.0007450784
# 7.45 E-4 - lower than Daphnia, higher than Chlamy
# very similar to daphnia, daphnia is 2.74 E-3. 

# kmers with the highest standard deviation

sd <- c()
for (i in 1:ncol(legit_data)) {
  sd <- c(sd, sd(legit_data[,i]))
}
sd
##   [1] 9957.1099033 6312.6563626 8606.9135939 8040.9738460 7767.0997694
##   [6]  570.5964272 1789.8022690  853.7009587 1254.8007746  378.3459036
##  [11]  230.3492923  183.1972275  188.7837067  332.2424407  335.1086236
##  [16]  169.6278825  327.5525751   79.1920233  396.9262071  195.3265407
##  [21]   90.6380252   40.1450147   48.8291491  176.3744513   12.2412836
##  [26]   51.8297817  426.2687198  111.5508598  208.0048271  120.4103667
##  [31]   18.2018852   18.9062639   83.6433266   63.1129763   52.7363957
##  [36]   45.6476752   36.3686831   42.5154621   16.9383147   39.5320561
##  [41]   17.0625560   16.6665497    7.8421324    9.6791553   25.5961594
##  [46]   16.3376868    9.3763077   18.6968278   28.5239660   38.9115751
##  [51]   12.0728024   57.0572337   29.5033287   29.6012729   73.7009512
##  [56]   16.7066702   13.2541626    9.1632612   15.5622560   28.3120289
##  [61]   28.2982676    8.4674822   16.9534867    6.5888211   30.3089501
##  [66]   11.0937146    4.2005369   10.9788189   18.0298987   19.3507631
##  [71]   34.8572776   56.1160209    7.4906817   40.5618313   19.9825589
##  [76]    9.0076942   25.2730707    7.5890319   26.5252713   57.4568452
##  [81]   10.7861078    2.7511383   12.0045824    6.1175097    4.8028498
##  [86]   10.9729619   14.7432785   13.7463975    7.5361169    6.5703485
##  [91]    9.9003758    9.4386799    8.7701417    4.3872420   11.3718690
##  [96]    3.7426031    5.3193396    4.2162387    5.0405273    5.5037682
## [101]   22.5642308   19.1164460    7.9308215   36.4249219    9.6125092
## [106]    6.5136700    6.2043048    2.9429164    3.9857219    8.8981661
## [111]    5.4297570    6.9444788    5.8663832    2.6372233   11.6857256
## [116]    1.8536703    3.4002931    8.7891692    5.5076366    6.0821834
## [121]    6.5291210    8.4407447   10.0860729    3.7469204    4.2874332
## [126]    2.5586677    2.5250416    2.9386673    2.3428130    6.2776984
## [131]    5.3380183    4.1794643   22.8362892    7.5265612    1.7055315
## [136]    3.6251102    5.4020038    6.3895524    4.4102022    1.8506297
## [141]    4.0404523    3.1878192    7.7663491    6.6728866    2.2278659
## [146]    5.0653438    3.6532146    5.3769198    2.2550674   15.1614854
## [151]    5.0684435    1.7931575    5.4407406    3.1444834    9.5947008
## [156]    4.9877285    9.2424676   13.6021024    8.7173894    4.0570806
## [161]    1.4314800    3.6177478    3.7849046    4.6055801    8.4295643
## [166]    2.5659941    4.5006411    6.7777854    1.7749897    2.8129887
## [171]    1.5803899    3.4992244    1.8896081    2.8183649    7.5265216
## [176]    2.0386250    1.9564274    5.8145051    3.2509880    2.9654171
## [181]    1.5851584    2.6954999    6.0079811    3.6830944    2.8845621
## [186]    1.1851385    6.0994855    1.0799653    3.5854369    1.4394000
## [191]    5.5539800    3.2340652   10.5070088    3.4647804    1.3510858
## [196]    2.0655693    1.7046625    6.5714931    9.0683079    4.0883660
## [201]    7.3734443    6.1834480    2.6100220    1.7239635    7.0765073
## [206]    2.0439891    6.3740054    7.0660472    4.2871014    3.2467072
## [211]    4.3908385    1.9508601    1.0489485    2.3859475    1.4774957
## [216]    2.9289521    1.7513095    2.9867118    2.6466406    1.8556864
## [221]    3.7742848    4.9056627    3.4135294    4.5098690    1.6249126
## [226]    2.7804980    3.2296228    1.9336527    7.9649253   12.5841699
## [231]    5.0899922    2.2379486    5.3368401    2.3436934    1.7420621
## [236]    2.8157543    1.4387750    1.8571999    3.4961983    3.7389719
## [241]    2.8357502    4.6999486    3.2925898    2.8595538    6.8921808
## [246]    1.9043079    2.5690652    2.4699129    1.9623987    3.3316314
## [251]    3.6181429    2.5846463    3.3029881    1.5440596    3.1623241
## [256]    2.4231367    2.4466166    1.3566438    3.2627182    2.5958148
## [261]    2.1936039    1.4168219    1.7815376    1.5556389    3.8235453
## [266]    2.6905016    2.9683035    3.8768662    2.1269965    1.7652066
## [271]    1.2261324    1.3882335    4.0067122    2.1624217    2.3688705
## [276]    2.1677589    3.6576346    5.7103506    2.5239538    2.1370961
## [281]    3.1429249    1.2378197    1.6942991    3.7154594    1.7665249
## [286]    1.8318136    7.1635150    1.0855347    2.0048945    1.2521898
## [291]    4.4784983    1.8595823    1.8329521    2.5334187    1.9583700
## [296]    3.6637937    2.1584051    1.8295409    4.0441783    1.1260898
## [301]    1.8069938    2.9500987    2.4660596    1.3082240    1.7598669
## [306]    1.3668529    1.5998933    2.6041545    0.7136022    2.8426739
## [311]    1.9263509    1.2703996    3.6812064    1.3008995    1.6664135
## [316]    3.3832089    1.4627867    3.4932053    1.8401759    2.6107575
## [321]    1.7265176    3.1925716    1.2028047    1.7925209    0.9711582
## [326]    2.4052882    1.4040474    1.5208721    0.8181110    2.7046235
## [331]    3.0417052    1.3560903    2.8157875    1.0381225    5.2992271
## [336]    2.4086212    4.4962079    1.3445884    4.0614032    2.4140260
## [341]    0.8366538    2.1216626    2.5637281    2.1851882    2.2663108
## [346]    1.4130576    1.9744802    2.4486451    4.7282184    1.1728564
## [351]    1.0135909    1.7697420    1.3894022    1.4507574    3.1122158
## [356]    1.7618655    1.3123523    1.2128617    1.6421444    1.5434975
## [361]    1.8326516    3.6378101    1.5358566    3.8486163    1.8863765
## [366]    1.3196631    1.9309178    1.7367908    1.1554598    2.4717763
## [371]    1.6266946    1.5642404    1.6221090    1.2485554    1.5601434
## [376]    1.8255976    0.9761845    1.5479371    1.8484187    1.2483611
## [381]    2.6866403    1.0365166    2.8019063    1.9911157    1.5089319
## [386]    2.3925958    1.5062018    1.2215534    1.9065261    1.2711116
## [391]    1.6105958    1.5843233    1.3697030    1.5212503    3.0643399
## [396]    1.3743428    2.4238683    1.7816216    1.2306241    1.6996059
## [401]    1.8394960    1.6682816    1.4243895    2.1893513    1.9615961
## [406]    2.7719094    1.4685968    1.6734613    2.2953463    1.6696921
## [411]    2.8304658    2.6520751    2.3253428    1.2501434    2.6526685
## [416]    2.3812287    1.8513401    2.5513152    2.5143391    2.5880911
## [421]    2.6372221    2.3188088    1.9510371    2.5832412    2.2589268
## [426]    2.3529869    6.0786728    2.6662312    2.4937127    2.3805265
## [431]    3.5344310    2.8935189    3.4683505    2.2724736

Look for kmers with a phylo signal, using the legit kmers

calc_mutation_rates_abs_random <- function (df_abun) {
  mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
  for (i in 1:ncol(df_abun)) {
    anc_states <- fastAnc(mouse_tree, sample(df_abun[,i]), vars=T, CI=T)
    for (j in 1:nrow(df_abun)) {
      subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
      mu_mx[j,i] <- (abs(df_abun[j,i] - anc_states$ace[subset_table$nearest_node])) / subset_table$gens
    }
  }
  rownames(mu_mx) <- rownames(df_abun)
  colnames(mu_mx) <- colnames(df_abun)
  return(mu_mx)
}

new_rownames <- gsub("/", "", rownames(legit_data))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2

rownames(legit_data) <- new_rownames2

mu_mx_rand_1 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_1 <- calc_norm_mu(legit_data, mu_mx_rand_1)
plot(colMeans(mu_mx_rand_norm_1))

#great, we have a few outliers

sort(colMeans(mu_mx_rand_norm_1), decreasing=T)[1:7]
##          AAAGACAG      AAAGACAGACAG           AGCATGG         AAAGAAAGG 
##       0.044398979       0.025758444       0.010513202       0.005971288 
## AAAGAAAGAAAGAAAGG          AAATGAAT        AAAGAAGAAG 
##       0.005559348       0.004930817       0.004675407
which(colnames(legit_data)=="AAAGACAG")
## [1] 104
which(colnames(legit_data)=="AAAGACAGACAG")
## [1] 27
which(colnames(legit_data)=="AGCATGG")
## [1] 427
which(colnames(legit_data)=="AAAGAAAGG")
## [1] 80
which(colnames(legit_data)=="AAATGAAT")
## [1] 230
which(colnames(legit_data)=="AAGATAGATAGATCT")
## [1] 433
which(colnames(legit_data)=="AAAGAAAGAAAGAAAGG")
## [1] 133
AAAGACAG <- legit_data[,104]
names(AAAGACAG) <- rownames(legit_data)

AAAGACAG
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##        68.345280        57.793219        86.259274        71.408165 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##        49.622705        94.690465         0.000000         0.000000 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         0.000000        81.809260         1.901294        51.787538 
##           B6JEiJ        B10ScNHsd 
##        62.088058         0.000000
obj<-contMap(mouse_tree,AAAGACAG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AAAGACAGACAG <- legit_data[,27]
names(AAAGACAGACAG) <- rownames(legit_data)

obj<-contMap(mouse_tree,AAAGACAGACAG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AAAGACAGACAG
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##       881.309078       698.319519       963.295641       838.750237 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##       739.659689      1001.591049         5.401604         4.415329 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         5.880901       919.069355         8.217183       729.856715 
##           B6JEiJ        B10ScNHsd 
##       843.105703         3.901369
# yes good one
AGCATGG <- legit_data[,427]
names(AGCATGG) <- rownames(legit_data)

obj<-contMap(mouse_tree,AGCATGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AGCATGG
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         4.234116         2.470631         2.183164         1.252682 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         2.135496         2.137580        15.617626        14.967658 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##        17.962139         3.360821        11.100449         1.949454 
##           B6JEiJ        B10ScNHsd 
##         1.188934         9.442596
AAAGAAAGG <- legit_data[,80]
names(AAAGAAAGG) <- rownames(legit_data)

obj<-contMap(mouse_tree,AAAGAAAGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AAAGAAAGG
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##        40.503028        95.985217        10.194145       115.770502 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##       140.923140        50.373755        26.822014        27.493762 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         4.731663       202.989221        24.439834        58.854640 
##           B6JEiJ        B10ScNHsd 
##        95.534331        24.380243
AAATGAAT <- legit_data[,230]
names(AAATGAAT) <- rownames(legit_data)
# not really, just one pair.
obj<-contMap(mouse_tree,AAATGAAT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# no


AAGATAGATAGATCT <- legit_data[,433]
names(AAGATAGATAGATCT) <- rownames(legit_data)
# nothing here
obj<-contMap(mouse_tree,AAGATAGATAGATCT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

AAAGAAAGAAAGAAAGG <- legit_data[,133]
names(AAAGAAAGAAAGAAAGG) <- rownames(legit_data)

obj<-contMap(mouse_tree,AAAGAAAGAAAGAAAGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# do it again and see if you get any additional

mu_mx_rand_2 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_2 <- calc_norm_mu(legit_data, mu_mx_rand_2)
par(mar = c(2, 2, 2, 2))
plot(colMeans(mu_mx_rand_norm_2))

mu_mx_rand_3 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_3 <- calc_norm_mu(legit_data, mu_mx_rand_3)
par(mar = c(2, 2, 2, 2))
plot(colMeans(mu_mx_rand_norm_3))

# nope, no others are having phylo signal

# have 6 kmers with phylogenetic signal

Now complex satellites

complex <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/complex_revised_cn.csv", header=T)
samples <- as.vector(complex$Sample)
complex <- as.matrix(complex[,c(2:3)])
rownames(complex) <- samples
complex
##                   cn_major cn_minor
## B6JEiJ           1048990.8 123362.6
## B10ScNHsd        1060820.1 135657.5
## B6NHsd           1317473.6 129958.4
## B6NTac           1207882.2 136154.8
## B6NJ             1031331.9 129371.0
## B6NTyrCBrdCrlCrl 1239977.1 137196.5
## B6JBomTac         999881.7 130793.7
## B6J              1118980.8 124911.5
## B10ScCr          1068573.0 128685.2
## B10SnJ            923277.8 129303.0
## B6ByJ            1069595.9 125996.3
## B6NCrl            954977.0 123889.9
## B10ScSnJ          971019.3 133243.0
## B10J             1056396.0 143527.2
# visualize:
range(complex[,1])
## [1]  923277.8 1317473.6
# first, major
obj<-contMap(mouse_tree,complex[,1],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# B6NHsd expansion (1317473), B6NJ loss (1031331.9)

complex[,1]
##           B6JEiJ        B10ScNHsd           B6NHsd           B6NTac 
##        1048990.8        1060820.1        1317473.6        1207882.2 
##             B6NJ B6NTyrCBrdCrlCrl        B6JBomTac              B6J 
##        1031331.9        1239977.1         999881.7        1118980.8 
##          B10ScCr           B10SnJ            B6ByJ           B6NCrl 
##        1068573.0         923277.8        1069595.9         954977.0 
##         B10ScSnJ             B10J 
##         971019.3        1056396.0
anc_states <- fastAnc(mouse_tree, complex[,1], vars=T, CI=T)
anc_states 
## Ancestral character estimates using fastAnc:
##      15      16      17      18      19      20      21      22      23 
## 1045425 1009114 1014464 1027204 1045385 1081737 1098129 1109461 1157943 
##      24      25      26      27 
## 1161155 1147246 1065387 1062584 
## 
## Variances on ancestral states:
##          15          16          17          18          19          20 
## 16995854715  5103246361  4400057772  4106422853  3438407147  4700132074 
##          21          22          23          24          25          26 
##  3277108126  2784220764  2344618027  1996686318  1609537905  3474444128 
##          27 
##  2832675811 
## 
## Lower & upper 95% CIs:
##        lower   upper
## 15  789903.6 1300947
## 16  869097.3 1149130
## 17  884451.9 1144477
## 18  901604.9 1152804
## 19  930455.1 1160316
## 20  947363.8 1216109
## 21  985926.4 1210331
## 22 1006040.4 1212882
## 23 1063037.1 1252848
## 24 1073573.6 1248736
## 25 1068612.2 1225879
## 26  949856.0 1180918
## 27  958267.5 1166901
#major satellite at ancestor of B6NHsd and B6NJ: 1157943
# gained by B6NHsd:
1317473-1157943 #159,530 copies 
## [1] 159530
# lost by B6NJ
1157943-1031331.9 # 126,611 copies lost
## [1] 126611.1
# second, minor
obj<-contMap(mouse_tree,complex[,2],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

complex[,2] # b6NHsd: 129958.4 , B6NJ: 129371
##           B6JEiJ        B10ScNHsd           B6NHsd           B6NTac 
##         123362.6         135657.5         129958.4         136154.8 
##             B6NJ B6NTyrCBrdCrlCrl        B6JBomTac              B6J 
##         129371.0         137196.5         130793.7         124911.5 
##          B10ScCr           B10SnJ            B6ByJ           B6NCrl 
##         128685.2         129303.0         125996.3         123889.9 
##         B10ScSnJ             B10J 
##         133243.0         143527.2
anc_states <- fastAnc(mouse_tree, complex[,2], vars=T, CI=T)

# 130624.7 
# how many copies and kb did these lines lose and gain?
130624.7 -129958.4 # B6NHsd : lost 666 copies
## [1] 666.3
130624.7 - 129371 # B6NJ : lost 1253 copies
## [1] 1253.7
mu_complex <- calc_mutation_rates(complex)
#mu_complex

mu_complex_norm <- calc_norm_mu(complex, mu_complex)


mu_complex_abs <- calc_mutation_rates_abs(complex)
mean(mu_complex_abs[,2])
## [1] 26.05974
mu_complex_abs_norm <- calc_norm_mu(complex, mu_complex_abs)
#mu_complex_abs_norm

colMeans(mu_complex_abs)
##  cn_major  cn_minor 
## 559.13985  26.05974
colMeans(mu_complex_abs_norm)
##     cn_major     cn_minor 
## 0.0005004309 0.0001976812
#major: 4.8 E-4, minor: 1.9 E-4
# major satellite has a higher mutation rate than minor, even after normalizing
# might be because of increased recombination -- more distal to the centromere

Finally, make a correlation plot of kmer changes

library(corrplot)

#http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software

combined_mu_norm <- cbind(mu_complex_norm, mu_common_norm)
combined_mu_norm[,1:10]
##                       cn_major      cn_minor            AC            AG
## B6JEiJ           -9.160690e-05 -1.615809e-04 -1.665755e-04  4.075430e-04
## B10ScNHsd         1.069893e-04 -7.387808e-05 -1.037055e-04  5.307961e-04
## B6NHsd            9.983401e-04 -3.696561e-05 -1.520792e-04 -1.564207e-03
## B6NTac            3.907011e-04  2.792860e-04  4.070876e-05  1.091716e-03
## B6NJ             -1.578695e-03 -4.101223e-04 -1.399663e-04  4.652882e-04
## B6NTyrCBrdCrlCrl  1.262965e-03  5.102066e-04  4.700042e-04 -4.409409e-04
## B6JBomTac        -5.087033e-04  2.367385e-04  1.526023e-04  1.838519e-04
## B6J               4.575415e-04 -1.616087e-04  9.140961e-05  4.974917e-05
## B10ScCr           1.728454e-04 -1.568226e-04  2.490011e-04 -7.222850e-04
## B10SnJ           -2.893223e-04 -8.429768e-05  4.230222e-04 -5.365049e-04
## B6ByJ            -1.255219e-04 -8.906739e-05 -3.731076e-04  3.595034e-04
## B6NCrl           -7.866809e-04 -2.184930e-04 -1.704128e-04  9.820323e-05
## B10ScSnJ         -1.597972e-04  6.259891e-06 -2.642960e-04 -6.374373e-05
## B10J              7.632285e-05  3.422101e-04 -9.510261e-05  6.792768e-05
##                            AAG        AACCCT          AGAT        AGAGGC
## B6JEiJ           -0.0016936235 -0.0004374261 -0.0036641401 -3.605975e-04
## B10ScNHsd         0.0010510781 -0.0008913244  0.0015920712 -3.000287e-04
## B6NHsd           -0.0033459144 -0.0009501471 -0.0043442036  2.057965e-06
## B6NTac            0.0012808578 -0.0011192028  0.0009625905  2.326053e-04
## B6NJ              0.0004670982 -0.0021052905  0.0017003030 -2.519822e-05
## B6NTyrCBrdCrlCrl -0.0009093677  0.0033453313 -0.0017989410  2.586653e-04
## B6JBomTac         0.0005484810  0.0011256591  0.0001359405  2.273234e-04
## B6J               0.0005808924  0.0004389475  0.0012644830  9.134611e-05
## B10ScCr          -0.0020978667  0.0001943518 -0.0025471971  2.915746e-04
## B10SnJ            0.0016646539  0.0027878050  0.0037813664  5.069733e-04
## B6ByJ             0.0007397267 -0.0014161398  0.0005660927 -4.350530e-04
## B6NCrl            0.0005409763  0.0001317758  0.0005682900 -1.447509e-04
## B10ScSnJ          0.0002449262 -0.0012163915 -0.0002705450 -2.517623e-04
## B10J              0.0002107909 -0.0005693525  0.0004712985 -1.601563e-04
##                           AAAG           AGG
## B6JEiJ           -1.514244e-03 -3.295951e-04
## B10ScNHsd         1.043085e-03 -1.410237e-04
## B6NHsd           -2.464712e-03 -6.330540e-04
## B6NTac            1.582728e-03 -4.349942e-05
## B6NJ              8.803666e-04 -3.848845e-04
## B6NTyrCBrdCrlCrl -1.402117e-03  7.395840e-04
## B6JBomTac        -1.279014e-06  2.846829e-04
## B6J               4.982097e-04  1.979003e-04
## B10ScCr          -1.573901e-03  3.587638e-04
## B10SnJ            5.719602e-04  1.111715e-03
## B6ByJ             4.967773e-04 -9.523429e-05
## B6NCrl            6.199326e-04 -1.910845e-04
## B10ScSnJ          3.594212e-04 -2.978637e-04
## B10J              4.671663e-04 -7.151320e-04
cor_mu <- cor(combined_mu_norm)
par(mar=c(3,3,10,2))
corrplot(cor_mu, type="upper", order="original", tl.col="black", tl.cex=0.5, tl.srt=90)

# relate copy number changes to sequence variability in the complex satellites

variability <- read.csv(file="~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/variation_centromere_sats.csv", header = T)
variability
##    File              Line error_rate_major error_rate_minor   mu_major
## 1  R104           BL6NTac        0.0356000        0.0448000   453.6645
## 2  R114           BL6NCrl        0.0355000        0.0446000  -872.7919
## 3  R124           BL6NHsd        0.0352000        0.0447000  1156.0206
## 4  R134 BL6NTyrCBrdCrlCrl        0.0354000        0.0448231  1448.9304
## 5  R144        BL6JBomTac        0.0358362        0.0447000  -540.5402
## 6    R2              BL6J        0.0350000        0.0444000   486.1765
## 7   R22         BL10ScSnJ        0.0365000        0.0441000  -162.1085
## 8   R32           BL10SnJ        0.0353000        0.0423000  -291.9592
## 9   R42          BL10ScCr        0.0349000        0.0421000   177.5475
## 10  R52             BL6NJ        0.0352000        0.0443000 -1811.1508
## 11  R62             BL10J        0.0358000        0.0431000    79.7868
## 12  R72            BL6ByJ        0.0355000        0.0447000  -137.8392
## 13  R84           BL6JEiJ        0.0357000        0.0448000   -97.5968
## 14  R94        BL10ScNHsd        0.0357000        0.0431000   111.8450
##       mu_minor
## 1   36.9628491
## 2  -28.1580321
## 3   -4.8286223
## 4   67.7851609
## 5   30.1363125
## 6  -20.5724470
## 7    0.8326899
## 8  -11.1769438
## 9  -20.9461098
## 10 -54.4881282
## 11  46.9015225
## 12 -11.4329535
## 13 -20.4892330
## 14 -10.1253442
ggplot(data = variability, aes(x = mu_major, y = error_rate_major)) + 
  geom_point() +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))

ggplot(data = variability, aes(x = mu_minor, y = error_rate_minor)) + 
  geom_point() +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))

Test for phylogenetic signal with the package phylosig

data_subset <- data_sorted_common[,5]
names(data_subset) <- rownames(data_sorted_common)

test_phlosig <- phylosig(mouse_tree, data_subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)


data_subset <- data_sorted_common[,27]
names(data_subset) <- rownames(data_sorted_common)

test_phlosig <- phylosig(mouse_tree, data_subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
test_phlosig$P
## [1] 0.001
phylosig_test <- c()
for (i in 1:ncol(legit_data)) {
  subset <- legit_data[,i]
  names(subset) <- rownames(legit_data)
  test_phlosig <- phylosig(mouse_tree, subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
  phylosig_test <- c(phylosig_test, test_phlosig$P)
  
}

possibilities <- which(phylosig_test < 0.05)

filtered_possibilities <- c()
for (i in possibilities) {
  prop <- min(legit_data[,i])/max(legit_data[,i])
  diff <- max(legit_data[,i]) - min(legit_data[,i])
  if (prop < 0.3 & diff > 15){
    filtered_possibilities <- c(filtered_possibilities, i)
  }
}

filtered_possibilities # this is pretty good. But for some reason 80 isn't showing up
## [1]  27  71  72 104 349 427
# the ones in the table:
which(colnames(legit_data)=="AAAGACAGACAG") #27
## [1] 27
which(colnames(legit_data)=="AAAGACAG") #104
## [1] 104
which(colnames(legit_data)=="AGCATGG") # 427
## [1] 427
which(colnames(legit_data)=="AAAGAAAGG") #80
## [1] 80
which(colnames(legit_data)=="AGGGC") #71
## [1] 71
which(colnames(legit_data)=="AAGAAGAAGAGG") #72
## [1] 72
min(legit_data[,27])
## [1] 3.901369
# check all these

phylosig_test[1:10]
##  [1] 0.124 0.628 0.343 0.443 0.627 0.105 0.515 0.183 0.589 0.492
phylosig_test[80] # why not significant? this one looks legit.
## [1] 0.439
colnames(legit_data)[15]
## [1] "ACAG"
check  <- legit_data[,15] 
names(check) <- rownames(legit_data)
check
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##        1623.5399        1521.7861        1250.1318        2013.6151 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##        1680.3956        1627.5711        1148.3129        1168.5650 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##         685.9261        1637.6058        1138.5821        1530.7700 
##           B6JEiJ        B10ScNHsd 
##        1546.1709        1123.8266
obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

colnames(legit_data)[71]
## [1] "AGGGC"
check_2  <- legit_data[,71]
names(check_2) <- rownames(legit_data)
check_2
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##         56.22773         77.83892         82.72548         86.91108 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##         51.54837         53.20102         84.08393        189.24518 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##        107.44587         88.20542         99.42593         64.28542 
##           B6JEiJ        B10ScNHsd 
##         57.50664         86.89554
obj<-contMap(mouse_tree,check_2,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

colnames(legit_data)[72]
## [1] "AAGAAGAAGAGG"
check_3  <- legit_data[,72]
names(check_3) <- rownames(legit_data)

obj<-contMap(mouse_tree,check_3,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

colnames(legit_data)[80]
## [1] "AAAGAAAGG"
check  <- legit_data[,80]
names(check) <- rownames(legit_data)

obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

#check

# want to apply some sort of filtering - requiring a difference of x%?
colnames(legit_data)[349]
## [1] "AAAGAAAGGAAGGAAGG"
check  <- legit_data[,349]
names(check) <- rownames(legit_data)

obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

# this one is good
check
##           B6NTac           B6NCrl           B6NHsd B6NTyrCBrdCrlCrl 
##        7.6277800        9.2428867        2.3568429        8.5065947 
##        B6JBomTac              B6J         B10ScSnJ           B10SnJ 
##       13.2646907       17.5286549        9.6558086       11.3011399 
##          B10ScCr             B6NJ             B10J            B6ByJ 
##        2.5862785        9.5971337        8.6801232        0.1576554 
##           B6JEiJ        B10ScNHsd 
##       14.2165408        9.8961555

Now, look at the Ymin satellite

Ymin <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/Ymin_cn.csv", header=T)
samples <- as.vector(Ymin$Sample)
Ymin <- as.matrix(Ymin$cn_HOR_Ymin)
rownames(Ymin) <- samples
Ymin
##                      [,1]
## B6JEiJ           28.63662
## B10ScNHsd        28.77115
## B6NHsd           21.89659
## B6NTac           28.92109
## B6NJ             20.58688
## B6NTyrCBrdCrlCrl 30.30322
## B6JBomTac        29.80160
## B6J              26.39369
## B10ScCr          20.03651
## B10SnJ           26.36346
## B6ByJ            27.44637
## B6NCrl           28.15001
## B10ScSnJ         25.73320
## B10J             28.53049
colnames(Ymin) <- "cn_HOR_Ymin"

# visualize:
range(Ymin[,1])
## [1] 20.03651 30.30322
# first, major
obj<-contMap(mouse_tree,Ymin[,1],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
    fsize=c(0.7,0.9))

anc_states <- fastAnc(mouse_tree, Ymin[,1], vars=T, CI=T)
anc_states 
## Ancestral character estimates using fastAnc:
##       15       16       17       18       19       20       21       22 
## 26.44703 25.73685 25.63808 25.49412 27.32168 27.15721 26.81652 26.57562 
##       23       24       25       26       27 
## 25.92030 26.34371 25.85353 27.87069 27.97749 
## 
## Variances on ancestral states:
##        15        16        17        18        19        20        21 
## 23.326382  7.004077  6.038968  5.635962  4.719127  6.450813  4.497748 
##        22        23        24        25        26        27 
##  3.821273  3.217929  2.740402  2.209050  4.768587  3.887776 
## 
## Lower & upper 95% CIs:
##       lower    upper
## 15 16.98074 35.91332
## 16 20.54967 30.92403
## 17 20.82152 30.45465
## 18 20.84104 30.14719
## 19 23.06386 31.57950
## 20 22.17911 32.13530
## 21 22.65978 30.97327
## 22 22.74420 30.40705
## 23 22.40433 29.43626
## 24 23.09909 29.58832
## 25 22.94041 28.76665
## 26 23.59062 32.15076
## 27 24.11287 31.84211
mu_Ymin <- calc_mutation_rates(Ymin)
#mu_complex
mu_Ymin
##                    cn_HOR_Ymin
## B6JEiJ            0.0045591078
## B10ScNHsd         0.0105033902
## B6NHsd           -0.0291572766
## B6NTac            0.0250231070
## B6NJ             -0.0822913756
## B6NTyrCBrdCrlCrl  0.0695263271
## B6JBomTac         0.0157250869
## B6J              -0.0136534962
## B10ScCr          -0.0234232196
## B10SnJ            0.0021313171
## B6ByJ             0.0030427341
## B6NCrl            0.0088948335
## B10ScSnJ          0.0003549335
## B10J              0.0087594744
mean(mu_Ymin)
## [1] -3.611772e-07
mu_Ymin_norm <- calc_norm_mu(Ymin, mu_Ymin)

mu_Ymin_norm
##                    cn_HOR_Ymin
## B6JEiJ            1.635807e-04
## B10ScNHsd         3.844343e-04
## B6NHsd           -1.124882e-03
## B6NTac            9.498703e-04
## B6NJ             -3.182984e-03
## B6NTyrCBrdCrlCrl  2.689239e-03
## B6JBomTac         5.620620e-04
## B6J              -4.880171e-04
## B10ScCr          -9.187696e-04
## B10SnJ            8.281188e-05
## B6ByJ             1.134649e-04
## B6NCrl            3.346990e-04
## B10ScSnJ          1.384399e-05
## B10J              3.206053e-04
# absolute mutation rate

mu_Ymin_abs <- calc_mutation_rates_abs(Ymin)
mean(mu_Ymin_abs)
## [1] 0.02121755
range(mu_Ymin_abs)
## [1] 0.0003549335 0.0822913756
mu_Ymin_abs_norm <- calc_norm_mu(Ymin, mu_Ymin_abs)
mean(mu_Ymin_abs_norm)
## [1] 0.0008092332
mu_complex_norm_all <- cbind(mu_complex_norm, mu_Ymin_norm)

hypermutator_df$type <- "simple"
head(hypermutator_df)
##         line_names kmer_names abun_hypermut   type
## 1           B6NTac         AC -1.665755e-04 simple
## 2           B6NCrl         AC -1.037055e-04 simple
## 3           B6NHsd         AC -1.520792e-04 simple
## 4 B6NTyrCBrdCrlCrl         AC  4.070876e-05 simple
## 5        B6JBomTac         AC -1.399663e-04 simple
## 6              B6J         AC  4.700042e-04 simple
abun_complex <- c()
for (j in 1:ncol(mu_complex_norm_all)) {
  for (i in 1:nrow(mu_complex_norm_all)) {
    abun_complex <- c(abun_complex, mu_complex_norm_all[i,j])
  }
}

line_names <- c(rep(rownames(mu_complex_norm_all), times = ncol(mu_complex_norm_all) ))

kmer_names <- c()
for (i in 1:ncol(mu_complex_norm_all)) {
  kmer_names <- c(kmer_names, c(rep(colnames(mu_complex_norm_all)[i], times = nrow(mu_complex_norm_all))))
}

#now make a dataframe

complex_df <- data.frame(line_names, kmer_names, abun_complex)
complex_df$type <- "complex"
head(complex_df)
##         line_names kmer_names  abun_complex    type
## 1           B6JEiJ   cn_major -0.0000916069 complex
## 2        B10ScNHsd   cn_major  0.0001069893 complex
## 3           B6NHsd   cn_major  0.0009983401 complex
## 4           B6NTac   cn_major  0.0003907011 complex
## 5             B6NJ   cn_major -0.0015786951 complex
## 6 B6NTyrCBrdCrlCrl   cn_major  0.0012629646 complex
colnames(complex_df) <- c("line_names", "kmer_names", "abun", "type")
colnames(hypermutator_df) <- c("line_names", "kmer_names", "abun", "type")

perline_boxplot_df <- rbind(hypermutator_df, complex_df)

#perline_boxplot_df$line_names <- factor(perline_boxplot_df$line_names, levels = c(""))
## make the plot including mu_complex
ggplot(data = perline_boxplot_df, aes(x = line_names, y = abun)) + 
  geom_boxplot(outlier.shape = NA, width=0.7) +
  #geom_point(position = position_jitter(w = 0.5, h = 0, alpha = 0.6, color=type)) +
  geom_jitter(aes(color=type, alpha=0.6)) +
  theme_bw() +
  theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))

* Investigate the correlation between kmers *

library(corrplot)

mu_complex_norm2 <- cbind(mu_complex_norm, mu_Ymin_norm)


combined_mu_norm <- cbind(mu_complex_norm2, mu_common_norm)
combined_mu_norm[,1:10]
##                       cn_major      cn_minor   cn_HOR_Ymin            AC
## B6JEiJ           -9.160690e-05 -1.615809e-04  1.635807e-04 -1.665755e-04
## B10ScNHsd         1.069893e-04 -7.387808e-05  3.844343e-04 -1.037055e-04
## B6NHsd            9.983401e-04 -3.696561e-05 -1.124882e-03 -1.520792e-04
## B6NTac            3.907011e-04  2.792860e-04  9.498703e-04  4.070876e-05
## B6NJ             -1.578695e-03 -4.101223e-04 -3.182984e-03 -1.399663e-04
## B6NTyrCBrdCrlCrl  1.262965e-03  5.102066e-04  2.689239e-03  4.700042e-04
## B6JBomTac        -5.087033e-04  2.367385e-04  5.620620e-04  1.526023e-04
## B6J               4.575415e-04 -1.616087e-04 -4.880171e-04  9.140961e-05
## B10ScCr           1.728454e-04 -1.568226e-04 -9.187696e-04  2.490011e-04
## B10SnJ           -2.893223e-04 -8.429768e-05  8.281188e-05  4.230222e-04
## B6ByJ            -1.255219e-04 -8.906739e-05  1.134649e-04 -3.731076e-04
## B6NCrl           -7.866809e-04 -2.184930e-04  3.346990e-04 -1.704128e-04
## B10ScSnJ         -1.597972e-04  6.259891e-06  1.384399e-05 -2.642960e-04
## B10J              7.632285e-05  3.422101e-04  3.206053e-04 -9.510261e-05
##                             AG           AAG        AACCCT          AGAT
## B6JEiJ            4.075430e-04 -0.0016936235 -0.0004374261 -0.0036641401
## B10ScNHsd         5.307961e-04  0.0010510781 -0.0008913244  0.0015920712
## B6NHsd           -1.564207e-03 -0.0033459144 -0.0009501471 -0.0043442036
## B6NTac            1.091716e-03  0.0012808578 -0.0011192028  0.0009625905
## B6NJ              4.652882e-04  0.0004670982 -0.0021052905  0.0017003030
## B6NTyrCBrdCrlCrl -4.409409e-04 -0.0009093677  0.0033453313 -0.0017989410
## B6JBomTac         1.838519e-04  0.0005484810  0.0011256591  0.0001359405
## B6J               4.974917e-05  0.0005808924  0.0004389475  0.0012644830
## B10ScCr          -7.222850e-04 -0.0020978667  0.0001943518 -0.0025471971
## B10SnJ           -5.365049e-04  0.0016646539  0.0027878050  0.0037813664
## B6ByJ             3.595034e-04  0.0007397267 -0.0014161398  0.0005660927
## B6NCrl            9.820323e-05  0.0005409763  0.0001317758  0.0005682900
## B10ScSnJ         -6.374373e-05  0.0002449262 -0.0012163915 -0.0002705450
## B10J              6.792768e-05  0.0002107909 -0.0005693525  0.0004712985
##                         AGAGGC          AAAG
## B6JEiJ           -3.605975e-04 -1.514244e-03
## B10ScNHsd        -3.000287e-04  1.043085e-03
## B6NHsd            2.057965e-06 -2.464712e-03
## B6NTac            2.326053e-04  1.582728e-03
## B6NJ             -2.519822e-05  8.803666e-04
## B6NTyrCBrdCrlCrl  2.586653e-04 -1.402117e-03
## B6JBomTac         2.273234e-04 -1.279014e-06
## B6J               9.134611e-05  4.982097e-04
## B10ScCr           2.915746e-04 -1.573901e-03
## B10SnJ            5.069733e-04  5.719602e-04
## B6ByJ            -4.350530e-04  4.967773e-04
## B6NCrl           -1.447509e-04  6.199326e-04
## B10ScSnJ         -2.517623e-04  3.594212e-04
## B10J             -1.601563e-04  4.671663e-04
cor_mu <- cor(combined_mu_norm)
par(mar=c(3,3,10,2))
corrplot(cor_mu, type="upper", order="original", tl.col="black", tl.cex=0.5, tl.srt=90)

#install.packages("Hmisc")
library("Hmisc")
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:Biostrings':
## 
##     mask, translate
## The following object is masked from 'package:ape':
## 
##     zoom
## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

res2<-rcorr(combined_mu_norm)
res2.2 <- flattenCorrMatrix(res2$r, res2$P)
head(res2.2)
##           row      column       cor            p
## 1    cn_major    cn_minor 0.5961939 0.0244345812
## 2    cn_major cn_HOR_Ymin 0.5728026 0.0322751053
## 3    cn_minor cn_HOR_Ymin 0.7871867 0.0008331837
## 4    cn_major          AC 0.3384863 0.2365039854
## 5    cn_minor          AC 0.4020968 0.1540876291
## 6 cn_HOR_Ymin          AC 0.3735314 0.1883233036
sig_cor <- subset(res2.2, p<0.05)

nrow(res2.2)
## [1] 2145
nrow(sig_cor)
## [1] 609
nrow(sig_cor[which(sig_cor$cor > 0), ])
## [1] 475
nrow(sig_cor[which(sig_cor$cor < 0), ])
## [1] 134
strong_cor <- subset(res2.2, cor>0.95)
strong_neg_cor <- subset(res2.2, cor<(-0.9))

nrow(sig_cor)
## [1] 609
sig_cor
##                      row              column        cor            p
## 1               cn_major            cn_minor  0.5961939 2.443458e-02
## 2               cn_major         cn_HOR_Ymin  0.5728026 3.227511e-02
## 3               cn_minor         cn_HOR_Ymin  0.7871867 8.331837e-04
## 15                    AG                 AAG  0.6775671 7.755117e-03
## 18           cn_HOR_Ymin              AACCCT  0.5924363 2.558601e-02
## 19                    AC              AACCCT  0.8693569 5.385384e-05
## 27                   AAG                AGAT  0.9429372 4.404587e-07
## 32                    AC              AGAGGC  0.8894076 2.073145e-05
## 35                AACCCT              AGAGGC  0.6663045 9.269577e-03
## 41                    AG                AAAG  0.7544191 1.821581e-03
## 42                   AAG                AAAG  0.9429756 4.387182e-07
## 44                  AGAT                AAAG  0.8874420 2.294166e-05
## 49                    AC                 AGG  0.8429168 1.532526e-04
## 52                AACCCT                 AGG  0.8373350 1.865758e-04
## 54                AGAGGC                 AGG  0.7269247 3.225056e-03
## 69           cn_HOR_Ymin              AAGGAG  0.5433964 4.460971e-02
## 71                    AG              AAGGAG  0.5969839 2.419754e-02
## 72                   AAG              AAGGAG  0.7995362 5.990438e-04
## 74                  AGAT              AAGGAG  0.6770462 7.820620e-03
## 76                  AAAG              AAGGAG  0.6188191 1.830330e-02
## 77                   AGG              AAGGAG  0.6431434 1.309396e-02
## 83                    AG                AAGG  0.8536113 1.028616e-04
## 84                   AAG                AAGG  0.8355912 1.981108e-04
## 86                  AGAT                AAGG  0.7092056 4.506670e-03
## 88                  AAAG                AAGG  0.7855805 8.683671e-04
## 91                AAGGAG                AAGG  0.8616187 7.474152e-05
## 95                    AC                 ACC  0.7646111 1.446958e-03
## 98                AACCCT                 ACC  0.8665906 6.068786e-05
## 100               AGAGGC                 ACC  0.6491285 1.200722e-02
## 102                  AGG                 ACC  0.9168749 3.974093e-06
## 109                   AC                ATCC  0.7771135 1.074017e-03
## 112               AACCCT                ATCC  0.7878102 8.198365e-04
## 113                 AGAT                ATCC  0.5527153 4.038039e-02
## 114               AGAGGC                ATCC  0.6698154 8.774933e-03
## 116                  AGG                ATCC  0.9325039 1.178976e-06
## 118               AAGGAG                ATCC  0.6961116 5.686714e-03
## 120                  ACC                ATCC  0.8815212 3.079216e-05
## 125                   AG               AAAGG  0.8640291 6.763051e-05
## 126                  AAG               AAAGG  0.8948035 1.554274e-05
## 128                 AGAT               AAAGG  0.7591261 1.640045e-03
## 130                 AAAG               AAAGG  0.8452743 1.407147e-04
## 133               AAGGAG               AAAGG  0.8298590 2.401628e-04
## 134                 AAGG               AAAGG  0.9659691 2.083762e-08
## 141                   AG                ACAG  0.8440128 1.473157e-04
## 142                  AAG                ACAG  0.5771353 3.069849e-02
## 146                 AAAG                ACAG  0.6633226 9.706363e-03
## 149               AAGGAG                ACAG  0.5460859 4.335732e-02
## 150                 AAGG                ACAG  0.6929443 6.005353e-03
## 153                AAAGG                ACAG  0.6919266 6.110638e-03
## 154             cn_major AAGTCTGCACACACATACT  0.5574111 3.836459e-02
## 157                   AC AAGTCTGCACACACATACT  0.8138235 3.973446e-04
## 158                   AG AAGTCTGCACACACATACT -0.6530048 1.134122e-02
## 160               AACCCT AAGTCTGCACACACATACT  0.9066425 7.796126e-06
## 162               AGAGGC AAGTCTGCACACACATACT  0.6767620 7.856546e-03
## 164                  AGG AAGTCTGCACACACATACT  0.7636160 1.480572e-03
## 168                  ACC AAGTCTGCACACACATACT  0.8603169 7.882775e-05
## 169                 ATCC AAGTCTGCACACACATACT  0.6407927 1.354079e-02
## 176                   AG                ACCT  0.8952618 1.515641e-05
## 177                  AAG                ACCT  0.8958921 1.463790e-05
## 179                 AGAT                ACCT  0.7367723 2.648840e-03
## 181                 AAAG                ACCT  0.8836579 2.773969e-05
## 184               AAGGAG                ACCT  0.8226237 3.032524e-04
## 185                 AAGG                ACCT  0.9209232 2.971879e-06
## 188                AAAGG                ACCT  0.9484069 2.435194e-07
## 189                 ACAG                ACCT  0.8299220 2.396642e-04
## 193          cn_HOR_Ymin              ACACAG  0.5625084 3.626140e-02
## 194                   AC              ACACAG  0.8124835 4.135323e-04
## 197               AACCCT              ACACAG  0.7079737 4.608720e-03
## 199               AGAGGC              ACACAG  0.7499037 2.010374e-03
## 201                  AGG              ACACAG  0.7924395 7.260019e-04
## 205                  ACC              ACACAG  0.6464933 1.247683e-02
## 206                 ATCC              ACACAG  0.5919067 2.575151e-02
## 209  AAGTCTGCACACACATACT              ACACAG  0.6963836 5.659978e-03
## 216                  AAG                  AT  0.8678601 5.746876e-05
## 218                 AGAT                  AT  0.9335224 1.078596e-06
## 220                 AAAG                  AT  0.7113482 4.333413e-03
## 223               AAGGAG                  AT  0.7371431 2.628854e-03
## 224                 AAGG                  AT  0.6745085 8.145893e-03
## 226                 ATCC                  AT  0.7349683 2.747793e-03
## 227                AAAGG                  AT  0.7231566 3.469960e-03
## 230                 ACCT                  AT  0.6255011 1.673849e-02
## 232             cn_major               AAGAG -0.5415476 4.548580e-02
## 236                   AG               AAGAG  0.8241880 2.885918e-04
## 237                  AAG               AAGAG  0.9416123 5.040257e-07
## 239                 AGAT               AAGAG  0.8551208 9.699140e-05
## 241                 AAAG               AAGAG  0.8966696 1.401845e-05
## 244               AAGGAG               AAGAG  0.7995170 5.993607e-04
## 245                 AAGG               AAGAG  0.9294186 1.531169e-06
## 248                AAAGG               AAGAG  0.9757621 2.778423e-09
## 249                 ACAG               AAGAG  0.6358095 1.452646e-02
## 251                 ACCT               AAGAG  0.9370874 7.809699e-07
## 253                   AT               AAGAG  0.8055010 5.067089e-04
## 258                   AG               AAGGG  0.5466281 4.310799e-02
## 266               AAGGAG               AAGGG  0.5684020 3.393717e-02
## 267                 AAGG               AAGGG  0.6538637 1.119757e-02
## 271                 ACAG               AAGGG  0.6072499 2.127173e-02
## 274               ACACAG               AAGGG  0.6271593 1.636644e-02
## 285               AGAGGC            AGAGAGGC  0.6353183 1.462651e-02
## 297               ACACAG            AGAGAGGC  0.7173135 3.878414e-03
## 300                AAGGG            AGAGAGGC  0.7838064 9.086015e-04
## 306                  AAG              ACATAT  0.8447661 1.433445e-04
## 308                 AGAT              ACATAT  0.8424598 1.557848e-04
## 310                 AAAG              ACATAT  0.7633388 1.490043e-03
## 313               AAGGAG              ACATAT  0.7917486 7.394321e-04
## 314                 AAGG              ACATAT  0.6992994 5.379474e-03
## 316                 ATCC              ACATAT  0.5815215 2.916145e-02
## 317                AAAGG              ACATAT  0.6805089 7.393133e-03
## 320                 ACCT              ACATAT  0.7220530 3.544392e-03
## 322                   AT              ACATAT  0.7745860 1.142399e-03
## 323                AAGAG              ACATAT  0.7141328 4.116093e-03
## 337                ACAGC ACAGACAGCACAGCACAGC  0.9932645 1.328715e-12
## 353             cn_minor                 AGC  0.6080687 2.105044e-02
## 354          cn_HOR_Ymin                 AGC  0.5487262 4.215305e-02
## 382                   AC               AGAGG  0.5781589 3.033451e-02
## 384                  AAG               AGAGG  0.7480044 2.094276e-03
## 386                 AGAT               AGAGG  0.7893651 7.872956e-04
## 387               AGAGGC               AGAGG  0.5708973 3.298711e-02
## 388                 AAAG               AGAGG  0.6466368 1.245089e-02
## 389                  AGG               AGAGG  0.5952743 2.471271e-02
## 391               AAGGAG               AGAGG  0.7680748 1.334663e-03
## 392                 AAGG               AGAGG  0.7991167 6.060123e-04
## 394                 ATCC               AGAGG  0.7420271 2.376562e-03
## 395                AAAGG               AGAGG  0.7047049 4.888267e-03
## 398                 ACCT               AGAGG  0.6545014 1.109183e-02
## 400                   AT               AGAGG  0.7760032 1.103643e-03
## 401                AAGAG               AGAGG  0.7281660 3.147433e-03
## 402                AAGGG               AGAGG  0.6328730 1.513241e-02
## 404               ACATAT               AGAGG  0.8173272 3.574155e-04
## 436             cn_major   AAAAACAAGGGAGATAT  0.5407722 4.585699e-02
## 440                   AG   AAAAACAAGGGAGATAT -0.9375549 7.475579e-07
## 441                  AAG   AAAAACAAGGGAGATAT -0.7855231 8.696458e-04
## 443                 AGAT   AAAAACAAGGGAGATAT -0.6308160 1.556820e-02
## 445                 AAAG   AAAAACAAGGGAGATAT -0.8331100 2.155119e-04
## 448               AAGGAG   AAAAACAAGGGAGATAT -0.6015439 2.286288e-02
## 449                 AAGG   AAAAACAAGGGAGATAT -0.8679448 5.725925e-05
## 452                AAAGG   AAAAACAAGGGAGATAT -0.9343189 1.005095e-06
## 453                 ACAG   AAAAACAAGGGAGATAT -0.7646264 1.446444e-03
## 454  AAGTCTGCACACACATACT   AAAAACAAGGGAGATAT  0.6316706 1.538601e-02
## 455                 ACCT   AAAAACAAGGGAGATAT -0.8996703 1.182541e-05
## 458                AAGAG   AAAAACAAGGGAGATAT -0.9112247 5.823148e-06
## 466             cn_major                AGGC  0.5469852 4.294434e-02
## 467             cn_minor                AGGC  0.5649979 3.526571e-02
## 468          cn_HOR_Ymin                AGGC  0.6404235 1.361201e-02
## 469                   AC                AGGC  0.5374873 4.745394e-02
## 470                   AG                AGGC -0.5500716 4.154896e-02
## 472               AACCCT                AGGC  0.7835755 9.139447e-04
## 480                  ACC                AGGC  0.6854910 6.810130e-03
## 484  AAGTCTGCACACACATACT                AGGC  0.8259637 2.726498e-04
## 508                ACAGC      ACAGACAGCACAGC  0.9944554 4.145573e-13
## 523  ACAGACAGCACAGCACAGC      ACAGACAGCACAGC  0.9981677 4.440892e-16
## 553             AGAGAGGC                AACC  0.6477331 1.225416e-02
## 565                   AC               AAAAG -0.6445830 1.282595e-02
## 568               AACCCT               AAAAG -0.8319098 2.243619e-04
## 572                  AGG               AAAAG -0.7348410 2.754883e-03
## 576                  ACC               AAAAG -0.8445019 1.447274e-04
## 577                 ATCC               AAAAG -0.8182549 3.474049e-04
## 580  AAGTCTGCACACACATACT               AAAAG -0.7349647 2.747993e-03
## 583                   AT               AAAAG -0.5905381 2.618288e-02
## 593                 AGGC               AAAAG -0.6866443 6.680409e-03
## 600                   AG              AAAGAG  0.6005715 2.314273e-02
## 601                  AAG              AAAGAG  0.9750127 3.329960e-09
## 603                 AGAT              AAAGAG  0.9664049 1.930525e-08
## 605                 AAAG              AAAGAG  0.8978024 1.315450e-05
## 608               AAGGAG              AAAGAG  0.7679822 1.337571e-03
## 609                 AAGG              AAAGAG  0.7839200 9.059815e-04
## 611                 ATCC              AAAGAG  0.5368995 4.774390e-02
## 612                AAAGG              AAAGAG  0.8509897 1.137408e-04
## 615                 ACCT              AAAGAG  0.8410475 1.638249e-04
## 617                   AT              AAAGAG  0.9183877 3.571354e-06
## 618                AAGAG              AAAGAG  0.9283130 1.676735e-06
## 621               ACATAT              AAAGAG  0.7952075 6.741465e-04
## 624                AGAGG              AAAGAG  0.7455215 2.208108e-03
## 626    AAAAACAAGGGAGATAT              AAAGAG -0.7285579 3.123237e-03
## 636                  AAG              AGATAT  0.7239813 3.415155e-03
## 638                 AGAT              AGATAT  0.7883346 8.087444e-04
## 640                 AAAG              AGATAT  0.5990864 2.357512e-02
## 643               AAGGAG              AGATAT  0.6167043 1.882094e-02
## 645                  ACC              AGATAT  0.5530898 4.021686e-02
## 646                 ATCC              AGATAT  0.5878097 2.705898e-02
## 652                   AT              AGATAT  0.8009872 5.754294e-04
## 653                AAGAG              AGATAT  0.5679404 3.411511e-02
## 656               ACATAT              AGATAT  0.7413573 2.409980e-03
## 666               AAAGAG              AGATAT  0.7675353 1.351680e-03
## 670                   AC              AGCAGG  0.9276230 1.773226e-06
## 671                   AG              AGCAGG -0.5706962 3.306297e-02
## 673               AACCCT              AGCAGG  0.9018867 1.039328e-05
## 675               AGAGGC              AGCAGG  0.8240837 2.895517e-04
## 677                  AGG              AGCAGG  0.8125791 4.123605e-04
## 681                  ACC              AGCAGG  0.8641457 6.730102e-05
## 682                 ATCC              AGCAGG  0.7542010 1.830363e-03
## 685  AAGTCTGCACACACATACT              AGCAGG  0.9269506 1.871623e-06
## 687               ACACAG              AGCAGG  0.7016500 5.161359e-03
## 698                 AGGC              AGCAGG  0.7202911 3.665806e-03
## 701                AAAAG              AGCAGG -0.7187996 3.771126e-03
## 709                  AAG              ACAGAT  0.8741481 4.350596e-05
## 711                 AGAT              ACAGAT  0.8487740 1.236480e-04
## 713                 AAAG              ACAGAT  0.6843178 6.944081e-03
## 714                  AGG              ACAGAT  0.6126558 1.984267e-02
## 716               AAGGAG              ACAGAT  0.8925574 1.755491e-05
## 717                 AAGG              ACAGAT  0.7845245 8.921394e-04
## 719                 ATCC              ACAGAT  0.7605322 1.588702e-03
## 720                AAAGG              ACAGAT  0.8168332 3.628408e-04
## 723                 ACCT              ACAGAT  0.7569590 1.721745e-03
## 725                   AT              ACAGAT  0.9304533 1.404532e-06
## 726                AAGAG              ACAGAT  0.8587379 8.402620e-05
## 729               ACATAT              ACAGAT  0.7411472 2.420537e-03
## 732                AGAGG              ACAGAT  0.7729729 1.187828e-03
## 734    AAAAACAAGGGAGATAT              ACAGAT -0.6012655 2.294275e-02
## 738                AAAAG              ACAGAT -0.5874160 2.718718e-02
## 739               AAAGAG              ACAGAT  0.9172323 3.875745e-06
## 740               AGATAT              ACAGAT  0.7346771 2.764035e-03
## 745                   AC              AAAGGG  0.5871851 2.726261e-02
## 747                  AAG              AAAGGG  0.6119585 2.002281e-02
## 748               AACCCT              AAAGGG  0.6040670 2.214864e-02
## 749                 AGAT              AAAGGG  0.5410206 4.573782e-02
## 752                  AGG              AAAGGG  0.8404142 1.675382e-04
## 754               AAGGAG              AAAGGG  0.9223098 2.680798e-06
## 755                 AAGG              AAAGGG  0.6496743 1.191166e-02
## 756                  ACC              AAAGGG  0.6598834 1.022965e-02
## 757                 ATCC              AAAGGG  0.7880314 8.151430e-04
## 758                AAAGG              AAAGGG  0.5782123 3.031561e-02
## 761                 ACCT              AAAGGG  0.6242475 1.702403e-02
## 762               ACACAG              AAAGGG  0.7192244 3.740889e-03
## 763                   AT              AAAGGG  0.6274207 1.630836e-02
## 764                AAGAG              AAAGGG  0.5723129 3.245700e-02
## 765                AAGGG              AAAGGG  0.5605297 3.706746e-02
## 767               ACATAT              AAAGGG  0.7068125 4.706549e-03
## 770                AGAGG              AAAGGG  0.6930870 5.990707e-03
## 776                AAAAG              AAAGGG -0.5648877 3.530935e-02
## 777               AAAGAG              AAAGGG  0.6226334 1.739708e-02
## 778               AGATAT              AAAGGG  0.6402170 1.365196e-02
## 780               ACAGAT              AAAGGG  0.8067144 4.894099e-04
## 785                   AG              AACAAG  0.6144484 1.938522e-02
## 786                  AAG              AACAAG  0.8438155 1.483706e-04
## 788                 AGAT              AACAAG  0.7266262 3.243948e-03
## 790                 AAAG              AACAAG  0.7047577 4.883648e-03
## 793               AAGGAG              AACAAG  0.8672127 5.909245e-05
## 794                 AAGG              AACAAG  0.8316526 2.262956e-04
## 796                 ATCC              AACAAG  0.5616071 3.662695e-02
## 797                AAAGG              AACAAG  0.8778710 3.663900e-05
## 800                 ACCT              AACAAG  0.7959779 6.602561e-04
## 802                   AT              AACAAG  0.7856132 8.676403e-04
## 803                AAGAG              AACAAG  0.8483200 1.257619e-04
## 806               ACATAT              AACAAG  0.6941714 5.880298e-03
## 809                AGAGG              AACAAG  0.6391303 1.386374e-02
## 811    AAAAACAAGGGAGATAT              AACAAG -0.7042463 4.928523e-03
## 816               AAAGAG              AACAAG  0.8107855 4.348054e-04
## 817               AGATAT              AACAAG  0.6386746 1.395328e-02
## 819               ACAGAT              AACAAG  0.8664931 6.094094e-05
## 820               AAAGGG              AACAAG  0.6787491 7.608062e-03
## 832                ACAGC              AAAAAG  0.5827540 2.874011e-02
## 847  ACAGACAGCACAGCACAGC              AAAAAG  0.5930627 2.539130e-02
## 853       ACAGACAGCACAGC              AAAAAG  0.5978586 2.393712e-02
## 865                   AC             AGGAGGC  0.7933153 7.092612e-04
## 866                   AG             AGGAGGC -0.6038391 2.221245e-02
## 868               AACCCT             AGGAGGC  0.8417580 1.597394e-04
## 870               AGAGGC             AGGAGGC  0.7396344 2.497651e-03
## 872                  AGG             AGGAGGC  0.8189203 3.403647e-04
## 876                  ACC             AGGAGGC  0.9263603 1.961665e-06
## 877                 ATCC             AGGAGGC  0.8143158 3.915267e-04
## 879                 ACAG             AGGAGGC -0.5632206 3.597447e-02
## 880  AAGTCTGCACACACATACT             AGGAGGC  0.8643840 6.663179e-05
## 882               ACACAG             AGGAGGC  0.5368358 4.777543e-02
## 893                 AGGC             AGGAGGC  0.6661019 9.298769e-03
## 896                AAAAG             AGGAGGC -0.7875134 8.261672e-04
## 899               AGCAGG             AGGAGGC  0.9277953 1.748718e-06
## 907                   AC                AAGC  0.5468783 4.299327e-02
## 929               ACATAT                AAGC  0.5886910 2.677364e-02
## 932                AGAGG                AAGC  0.6586665 1.041991e-02
## 945               AAAAAG                AAGC -0.6964421 5.654243e-03
## 951                   AG              AGAGGG  0.7524754 1.901042e-03
## 953               AACCCT              AGAGGG -0.5985924 2.372026e-02
## 961                  ACC              AGAGGG -0.6890528 6.415719e-03
## 964                 ACAG              AGAGGG  0.6819058 7.225906e-03
## 965  AAGTCTGCACACACATACT              AGAGGG -0.7118596 4.292847e-03
## 970                AAGGG              AGAGGG  0.5353358 4.852159e-02
## 977    AAAAACAAGGGAGATAT              AGAGGG -0.6405265 1.359210e-02
## 978                 AGGC              AGAGGG -0.7371386 2.629101e-03
## 981                AAAAG              AGAGGG  0.6078034 2.112194e-02
## 984               AGCAGG              AGAGGG -0.6226417 1.739514e-02
## 989              AGGAGGC              AGAGGG -0.6992378 5.385288e-03
## 994                   AC      AGAGAGGCAGAGGC  0.6359067 1.450674e-02
## 999               AGAGGC      AGAGAGGCAGAGGC  0.7073080 4.664606e-03
## 1011              ACACAG      AGAGAGGCAGAGGC  0.7547257 1.809294e-03
## 1014               AAGGG      AGAGAGGCAGAGGC  0.7830560 9.260614e-04
## 1015            AGAGAGGC      AGAGAGGCAGAGGC  0.8991328 1.219603e-05
## 1016              ACATAT      AGAGAGGCAGAGGC  0.5445738 4.405824e-02
## 1019               AGAGG      AGAGAGGCAGAGGC  0.6400548 1.368341e-02
## 1024                AACC      AGAGAGGCAGAGGC  0.6283393 1.610553e-02
## 1034                AAGC      AGAGAGGCAGAGGC  0.6763580 7.907825e-03
## 1039                  AC               AAAGC -0.5354158 4.848161e-02
## 1042              AACCCT               AAAGC -0.7733716 1.176466e-03
## 1046                 AGG               AAAGC -0.7933512 7.085817e-04
## 1050                 ACC               AAAGC -0.9368645 7.973279e-07
## 1051                ATCC               AAAGC -0.7580672 1.679572e-03
## 1054 AAGTCTGCACACACATACT               AAAGC -0.8000113 5.912269e-04
## 1067                AGGC               AAAGC -0.7378471 2.591239e-03
## 1070               AAAAG               AAAGC  0.8263103 2.696227e-04
## 1072              AGATAT               AAAGC -0.5795735 2.983683e-02
## 1073              AGCAGG               AAAGC -0.7001747 5.297430e-03
## 1075              AAAGGG               AAAGC -0.5941224 2.506443e-02
## 1078             AGGAGGC               AAAGC -0.8031011 5.423710e-04
## 1080              AGAGGG               AAAGC  0.7590949 1.641196e-03
## 1085                  AC              ACAGGC  0.8826218 2.918734e-05
## 1088              AACCCT              ACAGGC  0.8910874 1.898370e-05
## 1090              AGAGGC              ACAGGC  0.8170589 3.603539e-04
## 1092                 AGG              ACAGGC  0.9314514 1.290681e-06
## 1096                 ACC              ACAGGC  0.9257018 2.066297e-06
## 1097                ATCC              ACAGGC  0.8311918 2.297953e-04
## 1100 AAGTCTGCACACACATACT              ACAGGC  0.8999953 1.160587e-05
## 1102              ACACAG              ACAGGC  0.7744655 1.145743e-03
## 1113                AGGC              ACAGGC  0.6088313 2.084589e-02
## 1116               AAAAG              ACAGGC -0.7690186 1.305303e-03
## 1119              AGCAGG              ACAGGC  0.9092562 6.613143e-06
## 1121              AAAGGG              ACAGGC  0.6449852 1.275183e-02
## 1124             AGGAGGC              ACAGGC  0.9127137 5.278559e-06
## 1128               AAAGC              ACAGGC -0.7912819 7.486156e-04
## 1132                  AC     AAACAAGGGAGATAT  0.6084426 2.094998e-02
## 1133                  AG     AAACAAGGGAGATAT -0.8091731 4.558124e-04
## 1135              AACCCT     AAACAAGGGAGATAT  0.6221313 1.751438e-02
## 1137              AGAGGC     AAACAAGGGAGATAT  0.6020888 2.270720e-02
## 1138                AAAG     AAACAAGGGAGATAT -0.5355607 4.840920e-02
## 1139                 AGG     AAACAAGGGAGATAT  0.6367221 1.434199e-02
## 1143                 ACC     AAACAAGGGAGATAT  0.7933097 7.093683e-04
## 1145               AAAGG     AAACAAGGGAGATAT -0.6055257 2.174342e-02
## 1146                ACAG     AAACAAGGGAGATAT -0.6655918 9.372561e-03
## 1147 AAGTCTGCACACACATACT     AAACAAGGGAGATAT  0.8456324 1.388854e-04
## 1148                ACCT     AAACAAGGGAGATAT -0.5752445 3.137931e-02
## 1149              ACACAG     AAACAAGGGAGATAT  0.5656577 3.500526e-02
## 1151               AAGAG     AAACAAGGGAGATAT -0.5477510 4.259498e-02
## 1159   AAAAACAAGGGAGATAT     AAACAAGGGAGATAT  0.8375645 1.850990e-04
## 1160                AGGC     AAACAAGGGAGATAT  0.5969336 2.421259e-02
## 1166              AGCAGG     AAACAAGGGAGATAT  0.7976856 6.302822e-04
## 1171             AGGAGGC     AAACAAGGGAGATAT  0.8220631 3.086498e-04
## 1173              AGAGGG     AAACAAGGGAGATAT -0.7457768 2.196186e-03
## 1175               AAAGC     AAACAAGGGAGATAT -0.7332943 2.842199e-03
## 1176              ACAGGC     AAACAAGGGAGATAT  0.7826228 9.362629e-04
## 1210                AACC              ACAGAG -0.6311454 1.549779e-02
## 1246              ACACAG                ACAT  0.6103133 2.045271e-02
## 1249               AAGGG                ACAT  0.6254250 1.675572e-02
## 1250            AGAGAGGC                ACAT  0.8015132 5.670560e-04
## 1251              ACATAT                ACAT  0.6028462 2.249210e-02
## 1271      AGAGAGGCAGAGGC                ACAT  0.6745660 8.138411e-03
## 1281                 AAG                 ACT  0.7468601 2.146144e-03
## 1283                AGAT                 ACT  0.7689318 1.307979e-03
## 1286                 AGG                 ACT  0.6367597 1.433443e-02
## 1288              AAGGAG                 ACT  0.7841892 8.997957e-04
## 1289                AAGG                 ACT  0.6578432 1.055017e-02
## 1290                 ACC                 ACT  0.5833720 2.853053e-02
## 1291                ATCC                 ACT  0.8015557 5.663847e-04
## 1292               AAAGG                 ACT  0.7058145 4.791924e-03
## 1295                ACCT                 ACT  0.5833842 2.852641e-02
## 1297                  AT                 ACT  0.9178403 3.713009e-06
## 1298               AAGAG                 ACT  0.7436363 2.297772e-03
## 1301              ACATAT                 ACT  0.6075221 2.119797e-02
## 1304               AGAGG                 ACT  0.6992488 5.384248e-03
## 1310               AAAAG                 ACT -0.7020574 5.124267e-03
## 1311              AAAGAG                 ACT  0.8238590 2.916269e-04
## 1312              AGATAT                 ACT  0.6581630 1.049943e-02
## 1314              ACAGAT                 ACT  0.9571777 8.115607e-08
## 1315              AAAGGG                 ACT  0.7140482 4.122566e-03
## 1316              AACAAG                 ACT  0.7989392 6.089814e-04
## 1322               AAAGC                 ACT -0.5488524 4.209615e-02
## 1332                 AAG              AGAGAT  0.8505713 1.155604e-04
## 1334                AGAT              AGAGAT  0.8985417 1.261457e-05
## 1336                AAAG              AGAGAT  0.6793555 7.533466e-03
## 1337                 AGG              AGAGAT  0.5603979 3.712159e-02
## 1339              AAGGAG              AGAGAT  0.7214123 3.588170e-03
## 1340                AAGG              AGAGAT  0.5775631 3.054595e-02
## 1341                 ACC              AGAGAT  0.5742354 3.174723e-02
## 1342                ATCC              AGAGAT  0.7500078 2.005853e-03
## 1343               AAAGG              AGAGAT  0.6407042 1.355783e-02
## 1346                ACCT              AGAGAT  0.5915087 2.587640e-02
## 1348                  AT              AGAGAT  0.9577740 7.470393e-08
## 1349               AAGAG              AGAGAT  0.7342569 2.787603e-03
## 1352              ACATAT              AGAGAT  0.7493438 2.034826e-03
## 1355               AGAGG              AGAGAT  0.6866842 6.675962e-03
## 1361               AAAAG              AGAGAT -0.6021450 2.269117e-02
## 1362              AAAGAG              AGAGAT  0.9019679 1.034362e-05
## 1363              AGATAT              AGAGAT  0.8797358 3.354841e-05
## 1365              ACAGAT              AGAGAT  0.9135069 5.006033e-06
## 1366              AAAGGG              AGAGAT  0.6635262 9.676048e-03
## 1367              AACAAG              AGAGAT  0.7853398 8.737399e-04
## 1373               AAAGC              AGAGAT -0.5718297 3.263726e-02
## 1378                 ACT              AGAGAT  0.8795980 3.376915e-05
## 1402               AAGGG                AGGG  0.6876542 6.568411e-03
## 1403            AGAGAGGC                AGGG  0.5434225 4.459741e-02
## 1410                AGGC                AGGG -0.6049457 2.190387e-02
## 1423              AGAGGG                AGGG  0.7898654 7.770501e-04
## 1425               AAAGC                AGGG  0.5403205 4.607420e-02
## 1432            cn_major              AAAAAC  0.8440494 1.471210e-04
## 1434         cn_HOR_Ymin              AAAAAC  0.5728243 3.226707e-02
## 1436                  AG              AAAAAC -0.6460600 1.255536e-02
## 1437                 AAG              AAAAAC -0.6357100 1.454670e-02
## 1439                AGAT              AAAAAC -0.6769117 7.837603e-03
## 1441                AAAG              AAAAAC -0.7442898 2.266370e-03
## 1445                AAGG              AAAAAC -0.5868516 2.737177e-02
## 1448               AAAGG              AAAAAC -0.6474621 1.230257e-02
## 1450 AAGTCTGCACACACATACT              AAAAAC  0.7008881 5.231285e-03
## 1451                ACCT              AAAAAC -0.6108335 2.031602e-02
## 1453                  AT              AAAAAC -0.5355513 4.841387e-02
## 1454               AAGAG              AAAAAC -0.7430779 2.324876e-03
## 1459                 AGC              AAAAAC  0.5606600 3.701396e-02
## 1462   AAAAACAAGGGAGATAT              AAAAAC  0.7523407 1.906649e-03
## 1463                AGGC              AAAAAC  0.6790759 7.567792e-03
## 1467              AAAGAG              AAAAAC -0.6919680 6.106326e-03
## 1476              AGAGGG              AAAAAC -0.6077218 2.114398e-02
## 1480     AAACAAGGGAGATAT              AAAAAC  0.6278539 1.621246e-02
## 1490                  AG              AAGAGG  0.9128313 5.237424e-06
## 1491                 AAG              AAGAGG  0.8298571 2.401773e-04
## 1493                AGAT              AAGAGG  0.6549791 1.101312e-02
## 1495                AAAG              AAGAGG  0.7979150 6.263407e-04
## 1498              AAGGAG              AAGAGG  0.8577825 8.730489e-05
## 1499                AAGG              AAGAGG  0.9728022 5.511222e-09
## 1502               AAAGG              AAGAGG  0.9607164 4.874380e-08
## 1503                ACAG              AAGAGG  0.8047296 5.179592e-04
## 1505                ACCT              AAGAGG  0.9617456 4.165895e-08
## 1507                  AT              AAGAGG  0.5910708 2.601435e-02
## 1508               AAGAG              AAGAGG  0.9157132 4.308092e-06
## 1509               AAGGG              AAGAGG  0.6324843 1.521404e-02
## 1511              ACATAT              AAGAGG  0.6932488 5.974129e-03
## 1514               AGAGG              AAGAGG  0.6880932 6.520177e-03
## 1516   AAAAACAAGGGAGATAT              AAGAGG -0.8971486 1.364762e-05
## 1521              AAAGAG              AAGAGG  0.7555373 1.777079e-03
## 1524              ACAGAT              AAGAGG  0.7327439 2.873789e-03
## 1525              AAAGGG              AAGAGG  0.6403938 1.361775e-02
## 1526              AACAAG              AAGAGG  0.8141717 3.932227e-04
## 1534     AAACAAGGGAGATAT              AAGAGG -0.5873821 2.719827e-02
## 1537                 ACT              AAGAGG  0.5698911 3.336781e-02
## 1541            cn_major            AAAGGAAG -0.5516407 4.085245e-02
## 1545                  AG            AAAGGAAG  0.8562712 9.270294e-05
## 1546                 AAG            AAAGGAAG  0.9138849 4.880266e-06
## 1548                AGAT            AAAGGAAG  0.8045665 5.203635e-04
## 1550                AAAG            AAAGGAAG  0.9006218 1.119201e-05
## 1553              AAGGAG            AAAGGAAG  0.7321563 2.907825e-03
## 1554                AAGG            AAAGGAAG  0.9103519 6.163259e-06
## 1557               AAAGG            AAAGGAAG  0.9760673 2.576718e-09
## 1558                ACAG            AAAGGAAG  0.6524151 1.144066e-02
## 1560                ACCT            AAAGGAAG  0.9206025 3.042773e-06
## 1562                  AT            AAAGGAAG  0.7439232 2.283945e-03
## 1563               AAGAG            AAAGGAAG  0.9819305 4.833791e-10
## 1566              ACATAT            AAAGGAAG  0.6583388 1.047161e-02
## 1569               AGAGG            AAAGGAAG  0.6397617 1.374038e-02
## 1571   AAAAACAAGGGAGATAT            AAAGGAAG -0.9541723 1.211222e-07
## 1576              AAAGAG            AAAGGAAG  0.8770980 3.798652e-05
## 1579              ACAGAT            AAAGGAAG  0.7813765 9.661139e-04
## 1581              AACAAG            AAAGGAAG  0.8389189 1.765755e-04
## 1589     AAACAAGGGAGATAT            AAAGGAAG -0.6578506 1.054900e-02
## 1592                 ACT            AAAGGAAG  0.6699531 8.755969e-03
## 1593              AGAGAT            AAAGGAAG  0.6585840 1.043291e-02
## 1595              AAAAAC            AAAGGAAG -0.7395101 2.504071e-03
## 1596              AAGAGG            AAAGGAAG  0.9109297 5.936315e-06
## 1608               ACAGC      AAAAATCTTAAAGG -0.6256377 1.670762e-02
## 1609              AAGGAG      AAAAATCTTAAAGG -0.5603924 3.712387e-02
## 1623 ACAGACAGCACAGCACAGC      AAAAATCTTAAAGG -0.6290862 1.594203e-02
## 1629      ACAGACAGCACAGC      AAAAATCTTAAAGG -0.6154109 1.914291e-02
## 1633              AGATAT      AAAAATCTTAAAGG -0.5666919 3.459988e-02
## 1635              ACAGAT      AAAAATCTTAAAGG -0.6205377 1.789067e-02
## 1636              AAAGGG      AAAAATCTTAAAGG -0.6170336 1.873962e-02
## 1637              AACAAG      AAAAATCTTAAAGG -0.5636608 3.579795e-02
## 1643               AAAGC      AAAAATCTTAAAGG  0.5909849 2.604145e-02
## 1648                 ACT      AAAAATCTTAAAGG -0.5772176 3.066908e-02
## 1649              AGAGAT      AAAAATCTTAAAGG -0.5947339 2.487727e-02
## 1657                  AC              ACTGCT  0.6427693 1.316431e-02
## 1660              AACCCT              ACTGCT  0.7191759 3.744335e-03
## 1662              AGAGGC              ACTGCT  0.5907612 2.611221e-02
## 1664                 AGG              ACTGCT  0.6649561 9.465163e-03
## 1668                 ACC              ACTGCT  0.7696240 1.286740e-03
## 1669                ATCC              ACTGCT  0.7640271 1.466608e-03
## 1672 AAGTCTGCACACACATACT              ACTGCT  0.7096391 4.471186e-03
## 1685                AGGC              ACTGCT  0.6611117 1.004032e-02
## 1688               AAAAG              ACTGCT -0.7672257 1.361526e-03
## 1691              AGCAGG              ACTGCT  0.7913762 7.467516e-04
## 1696             AGGAGGC              ACTGCT  0.8736008 4.459848e-05
## 1700               AAAGC              ACTGCT -0.6921888 6.083380e-03
## 1701              ACAGGC              ACTGCT  0.7267566 3.235687e-03
## 1702     AAACAAGGGAGATAT              ACTGCT  0.5855584 2.779824e-02
## 1705                 ACT              ACTGCT  0.5613406 3.673551e-02
## 1715                  AC              ACACAT  0.6348328 1.472592e-02
## 1718              AACCCT              ACACAT  0.8134960 4.012531e-04
## 1722                 AGG              ACACAT  0.7911380 7.514649e-04
## 1726                 ACC              ACACAT  0.8901559 1.993727e-05
## 1727                ATCC              ACACAT  0.8666711 6.047954e-05
## 1730 AAGTCTGCACACACATACT              ACACAT  0.6926040 6.040406e-03
## 1733                  AT              ACACAT  0.6657221 9.353679e-03
## 1743                AGGC              ACACAT  0.6208681 1.781215e-02
## 1746               AAAAG              ACACAT -0.8640085 6.768878e-05
## 1748              AGATAT              ACACAT  0.5629358 3.608900e-02
## 1749              AGCAGG              ACACAT  0.7263320 3.262647e-03
## 1750              ACAGAT              ACACAT  0.6281278 1.615205e-02
## 1751              AAAGGG              ACACAT  0.5785985 3.017920e-02
## 1754             AGGAGGC              ACACAT  0.8273850 2.604092e-04
## 1756              AGAGGG              ACACAT -0.6295187 1.584793e-02
## 1758               AAAGC              ACACAT -0.8583684 8.528209e-05
## 1759              ACAGGC              ACACAT  0.7511447 1.957010e-03
## 1760     AAACAAGGGAGATAT              ACACAT  0.5395026 4.646948e-02
## 1763                 ACT              ACACAT  0.7530063 1.879073e-03
## 1764              AGAGAT              ACACAT  0.6650002 9.458719e-03
## 1770              ACTGCT              ACACAT  0.7971878 6.389041e-04
## 1782               ACAGC               AAACC -0.8505582 1.156178e-04
## 1797 ACAGACAGCACAGCACAGC               AAACC -0.8788170 3.504344e-05
## 1803      ACAGACAGCACAGC               AAACC -0.8832293 2.833114e-05
## 1812              AAAAAG               AAACC -0.6172248 1.869251e-02
## 1829              ACTGCT               AAACC  0.5559035 3.900351e-02
## 1835                  AG              AAAACC -0.7745204 1.144220e-03
## 1840                AAAG              AAAACC -0.6039374 2.218490e-02
## 1844                AAGG              AAAACC -0.6064134 2.149961e-02
## 1845                 ACC              AAAACC  0.6890482 6.416214e-03
## 1847               AAAGG              AAAACC -0.6588306 1.039409e-02
## 1848                ACAG              AAAACC -0.6303054 1.567784e-02
## 1849 AAGTCTGCACACACATACT              AAAACC  0.7560369 1.757473e-03
## 1850                ACCT              AAAACC -0.5950317 2.478647e-02
## 1853               AAGAG              AAAACC -0.5976837 2.398901e-02
## 1861   AAAAACAAGGGAGATAT              AAAACC  0.8380955 1.817188e-04
## 1868              AGCAGG              AAAACC  0.6296777 1.581343e-02
## 1873             AGGAGGC              AAAACC  0.6758703 7.970071e-03
## 1875              AGAGGG              AAAACC -0.7065394 4.729791e-03
## 1877               AAAGC              AAAACC -0.6845233 6.920471e-03
## 1878              ACAGGC              AAAACC  0.6854976 6.809375e-03
## 1879     AAACAAGGGAGATAT              AAAACC  0.9464340 3.037009e-07
## 1885              AAAAAC              AAAACC  0.6440681 1.292133e-02
## 1886              AAGAGG              AAAACC -0.6212649 1.771821e-02
## 1887            AAAGGAAG              AAAACC -0.6953682 5.760291e-03
## 1892            cn_major             AAAAACC  0.6687758 8.919233e-03
## 1896                  AG             AAAAACC -0.7098716 4.452249e-03
## 1897                 AAG             AAAAACC -0.7919462 7.355705e-04
## 1899                AGAT             AAAAACC -0.7275218 3.187534e-03
## 1901                AAAG             AAAAACC -0.8412995 1.623661e-04
## 1905                AAGG             AAAAACC -0.6938952 5.908266e-03
## 1908               AAAGG             AAAAACC -0.8093634 4.532910e-04
## 1909                ACAG             AAAAACC -0.5925099 2.556307e-02
## 1910 AAGTCTGCACACACATACT             AAAAACC  0.5877763 2.706986e-02
## 1911                ACCT             AAAAACC -0.7471097 2.134747e-03
## 1913                  AT             AAAAACC -0.6117614 2.007395e-02
## 1914               AAGAG             AAAAACC -0.8338486 2.102076e-04
## 1922   AAAAACAAGGGAGATAT             AAAAACC  0.8896429 2.047902e-05
## 1927              AAAGAG             AAAAACC -0.7590166 1.644097e-03
## 1930              ACAGAT             AAAAACC -0.5725235 3.237870e-02
## 1932              AACAAG             AAAAACC -0.5750942 3.143389e-02
## 1933              AAAAAG             AAAAACC  0.5590630 3.767342e-02
## 1940     AAACAAGGGAGATAT             AAAAACC  0.7352132 2.734193e-03
## 1946              AAAAAC             AAAAACC  0.8175790 3.546762e-04
## 1947              AAGAGG             AAAAACC -0.6889266 6.429382e-03
## 1948            AAAGGAAG             AAAAACC -0.8791965 3.441943e-05
## 1953              AAAACC             AAAAACC  0.7857744 8.640576e-04
## 1958                  AG            ACACATAT  0.6297079 1.580690e-02
## 1959                 AAG            ACACATAT  0.9565039 8.899533e-08
## 1961                AGAT            ACACATAT  0.9389898 6.522895e-07
## 1963                AAAG            ACACATAT  0.8898077 2.030374e-05
## 1966              AAGGAG            ACACATAT  0.7701492 1.270810e-03
## 1967                AAGG            ACACATAT  0.8668021 6.014169e-05
## 1969                ATCC            ACACATAT  0.5536394 3.997771e-02
## 1970               AAAGG            ACACATAT  0.8937335 1.647629e-05
## 1973                ACCT            ACACATAT  0.8246451 2.844179e-04
## 1975                  AT            ACACATAT  0.8982272 1.284200e-05
## 1976               AAGAG            ACACATAT  0.9307724 1.367266e-06
## 1979              ACATAT            ACACATAT  0.8223294 3.060758e-04
## 1982               AGAGG            ACACATAT  0.8498136 1.189161e-04
## 1984   AAAAACAAGGGAGATAT            ACACATAT -0.7712784 1.237089e-03
## 1989              AAAGAG            ACACATAT  0.9341650 1.018970e-06
## 1990              AGATAT            ACACATAT  0.6348444 1.472354e-02
## 1992              ACAGAT            ACACATAT  0.8603236 7.880632e-05
## 1993              AAAGGG            ACACATAT  0.5583915 3.795327e-02
## 1994              AACAAG            ACACATAT  0.8228387 3.012022e-04
## 1995              AAAAAG            ACACATAT -0.5606745 3.700800e-02
## 1997                AAGC            ACACATAT  0.5327225 4.984181e-02
## 2005                 ACT            ACACATAT  0.7731975 1.181418e-03
## 2006              AGAGAT            ACACATAT  0.8268936 2.645899e-04
## 2008              AAAAAC            ACACATAT -0.6666895 9.214307e-03
## 2009              AAGAGG            ACACATAT  0.8068933 4.868995e-04
## 2010            AAAGGAAG            ACACATAT  0.9039513 9.190343e-06
## 2016             AAAAACC            ACACATAT -0.8069037 4.867536e-04
## 2020                  AC        AGCAGGAGGAGG  0.6411136 1.347910e-02
## 2023              AACCCT        AGCAGGAGGAGG  0.5498888 4.163065e-02
## 2025              AGAGGC        AGCAGGAGGAGG  0.6740693 8.203241e-03
## 2027                 AGG        AGCAGGAGGAGG  0.6112526 2.020644e-02
## 2035 AAGTCTGCACACACATACT        AGCAGGAGGAGG  0.6263436 1.654868e-02
## 2037              ACACAG        AGCAGGAGGAGG  0.8404333 1.674247e-04
## 2041            AGAGAGGC        AGCAGGAGGAGG  0.6273619 1.632142e-02
## 2054              AGCAGG        AGCAGGAGGAGG  0.5843175 2.821209e-02
## 2064              ACAGGC        AGCAGGAGGAGG  0.6872426 6.613884e-03
## 2065     AAACAAGGGAGATAT        AGCAGGAGGAGG  0.5464055 4.321024e-02
## 2079             AAAAACC        AGCAGGAGGAGG  0.5412812 4.561309e-02
## 2086                 AAG              ACATAG  0.8623949 7.238826e-05
## 2088                AGAT              ACATAG  0.8279815 2.554058e-04
## 2090                AAAG              ACATAG  0.6902402 6.288259e-03
## 2091                 AGG              ACATAG  0.6091706 2.075538e-02
## 2093              AAGGAG              ACATAG  0.9352899 9.211397e-07
## 2094                AAGG              ACATAG  0.8005357 5.826957e-04
## 2096                ATCC              ACATAG  0.7216642 3.570910e-03
## 2097               AAAGG              ACATAG  0.7968310 6.451415e-04
## 2100                ACCT              ACATAG  0.7638464 1.472733e-03
## 2102                  AT              ACATAG  0.8893262 2.081936e-05
## 2103               AAGAG              ACATAG  0.8235778 2.942407e-04
## 2106              ACATAT              ACATAG  0.8778584 3.666065e-05
## 2109               AGAGG              ACATAG  0.7925484 7.239044e-04
## 2111   AAAAACAAGGGAGATAT              ACATAG -0.5698117 3.339801e-02
## 2115               AAAAG              ACATAG -0.5438580 4.439291e-02
## 2116              AAAGAG              ACATAG  0.8587177 8.409459e-05
## 2117              AGATAT              ACATAG  0.7956712 6.657581e-04
## 2119              ACAGAT              ACATAG  0.9289660 1.589445e-06
## 2120              AAAGGG              ACATAG  0.8508024 1.145527e-04
## 2121              AACAAG              ACATAG  0.8504493 1.160956e-04
## 2132                 ACT              ACATAG  0.8424517 1.558300e-04
## 2133              AGAGAT              ACATAG  0.8525818 1.070280e-04
## 2136              AAGAGG              ACATAG  0.7781731 1.046341e-03
## 2137            AAAGGAAG              ACATAG  0.7576165 1.696622e-03
## 2140              ACACAT              ACATAG  0.6048381 2.193375e-02
## 2144            ACACATAT              ACATAG  0.8480556 1.270067e-04
# very high sig: AAG            ACACATAT  0.9565039 8.899533e-08
#ACAGC ACAGACAGCACAGCACAGC  0.9932645 1.328715e-12
# AAAGG            AAAGGAAG  0.9760673 2.576718e-09
#AGAT              AAAGAG  0.9664049 1.930525e-08
# ACAGACAGCACAGCACAGC      ACAGACAGCACAGC  0.9981677 4.440892e-16
# ACAGC      ACAGACAGCACAGC  0.9944554 4.145573e-13
# AAAGG               AAGAG  0.9757621 2.778423e-09
# AAG                AAAG  0.9429756 4.387182e-07

# complex_repeats:
# cn_major cn_minor  0.596193948 0.02443458
# cn_minor  AGC  0.6080686725 0.02105044
# cn_minor   AGGC  0.5649979280 0.03526571
# cn_major  AAAAACC  0.66877575 0.008919233
# cn_major    AAAGGAAG -0.55164066 0.040852452
# cn_major  AAAAAC  0.84404938 0.000147121
# cn_major    AGGC  0.54698523 0.042944336
# cn_major   AAAAACAAGGGAGATAT  0.54077219 0.045856990
# cn_HOR_Ymin   AACCCT  0.59243629 0.02558601
# cn_HOR_Ymin     AAGGAG  0.54339639 0.04460971
# cn_HOR_Ymin    ACACAG  0.56250836 0.03626140
# cn_HOR_Ymin        AGC  0.54872625 0.04215305
# cn_HOR_Ymin    AGGC  0.64042345 0.01361201
# cn_HOR_Ymin   AAAAAC  0.57282428 0.03226707
# cn_major    cn_HOR_Ymin  0.57280260 0.032275105
# cn_minor    cn_HOR_Ymin  0.7871866529 0.0008331837

res2.2[which(res2.2$row == "cn_minor"),]
##           row              column           cor            p
## 3    cn_minor         cn_HOR_Ymin  0.7871866529 0.0008331837
## 5    cn_minor                  AC  0.4020967811 0.1540876291
## 8    cn_minor                  AG  0.0024032620 0.9934943573
## 12   cn_minor                 AAG  0.0288507825 0.9220082684
## 17   cn_minor              AACCCT  0.4394568064 0.1158949115
## 23   cn_minor                AGAT -0.1118349440 0.7034760976
## 30   cn_minor              AGAGGC  0.2716551453 0.3474689709
## 38   cn_minor                AAAG -0.0701922507 0.8115384962
## 47   cn_minor                 AGG  0.1795982604 0.5389698138
## 57   cn_minor               ACAGC  0.0772760679 0.7928783960
## 68   cn_minor              AAGGAG  0.2923860323 0.3103906534
## 80   cn_minor                AAGG  0.1174742496 0.6891878515
## 93   cn_minor                 ACC  0.1795772716 0.5390180412
## 107  cn_minor                ATCC  0.1400616173 0.6329575704
## 122  cn_minor               AAAGG  0.0587759051 0.8418040937
## 138  cn_minor                ACAG  0.3336816629 0.2436471735
## 155  cn_minor AAGTCTGCACACACATACT  0.4065565841 0.1491423268
## 173  cn_minor                ACCT  0.0972248304 0.7409090515
## 192  cn_minor              ACACAG  0.4963592495 0.0710222977
## 212  cn_minor                  AT -0.0755744038 0.7973518835
## 233  cn_minor               AAGAG -0.0494216306 0.8667571276
## 255  cn_minor               AAGGG  0.3050441182 0.2889174740
## 278  cn_minor            AGAGAGGC  0.3803112059 0.1797928011
## 302  cn_minor              ACATAT  0.3498796788 0.2200848007
## 327  cn_minor ACAGACAGCACAGCACAGC  0.0522718809 0.8591406626
## 353  cn_minor                 AGC  0.6080686725 0.0210504418
## 380  cn_minor               AGAGG  0.1666156976 0.5691529772
## 408  cn_minor        AAAGACAGACAG -0.0368941898 0.9003524826
## 437  cn_minor   AAAAACAAGGGAGATAT  0.0251312206 0.9320405579
## 467  cn_minor                AGGC  0.5649979280 0.0352657133
## 498  cn_minor      ACAGACAGCACAGC  0.0603568300 0.8375998717
## 530  cn_minor                AACC -0.1560353328 0.5942553371
## 563  cn_minor               AAAAG -0.3056937031 0.2878396036
## 597  cn_minor              AAAGAG -0.1234548405 0.6741395001
## 632  cn_minor              AGATAT -0.1061813248 0.7178924999
## 668  cn_minor              AGCAGG  0.3087901482 0.2827340345
## 705  cn_minor              ACAGAT -0.0176544987 0.9522335372
## 743  cn_minor              AAAGGG  0.2527080346 0.3833912880
## 782  cn_minor              AACAAG  0.0912374292 0.7564099844
## 822  cn_minor              AAAAAG -0.0687614602 0.8153191057
## 863  cn_minor             AGGAGGC  0.0627222736 0.8313168920
## 905  cn_minor                AAGC  0.4127161514 0.1424868068
## 948  cn_minor              AGAGGG -0.0892099415 0.7616786664
## 992  cn_minor      AGAGAGGCAGAGGC  0.4758936279 0.0854177320
## 1037 cn_minor               AAAGC -0.1754958763 0.5484316170
## 1083 cn_minor              ACAGGC  0.2110608654 0.4688744398
## 1130 cn_minor     AAACAAGGGAGATAT  0.0647303468 0.8259904684
## 1178 cn_minor              ACAGAG -0.0629743315 0.8306479331
## 1227 cn_minor                ACAT  0.3923292393 0.1652928707
## 1277 cn_minor                 ACT -0.0990550271 0.7361886962
## 1328 cn_minor              AGAGAT -0.0816660355 0.7813652633
## 1380 cn_minor                AGGG -0.0718793881 0.8070854414
## 1433 cn_minor              AAAAAC  0.4943928231 0.0723247562
## 1487 cn_minor              AAGAGG  0.1861519545 0.5240036571
## 1542 cn_minor            AAAGGAAG -0.0398925747 0.8922954371
## 1598 cn_minor      AAAAATCTTAAAGG  0.1589578180 0.5872778141
## 1655 cn_minor              ACTGCT  0.0354668725 0.9041910490
## 1713 cn_minor              ACACAT  0.0744723396 0.8002521861
## 1772 cn_minor               AAACC -0.1186387581 0.6862491310
## 1832 cn_minor              AAAACC -0.0002476959 0.9993294795
## 1893 cn_minor             AAAAACC  0.0650178278 0.8252284874
## 1955 cn_minor            ACACATAT  0.0415966961 0.8877204976
## 2018 cn_minor        AGCAGGAGGAGG  0.2382297860 0.4121149488
## 2082 cn_minor              ACATAG  0.1873903226 0.5211966624
res2.2[which(res2.2$row == "AAAAACAAGGGAGATAT" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] # 8 
##                    row   column        cor            p
## 626  AAAAACAAGGGAGATAT   AAAGAG -0.7285579 3.123237e-03
## 734  AAAAACAAGGGAGATAT   ACAGAT -0.6012655 2.294275e-02
## 811  AAAAACAAGGGAGATAT   AACAAG -0.7042463 4.928523e-03
## 977  AAAAACAAGGGAGATAT   AGAGGG -0.6405265 1.359210e-02
## 1516 AAAAACAAGGGAGATAT   AAGAGG -0.8971486 1.364762e-05
## 1571 AAAAACAAGGGAGATAT AAAGGAAG -0.9541723 1.211222e-07
## 1984 AAAAACAAGGGAGATAT ACACATAT -0.7712784 1.237089e-03
## 2111 AAAAACAAGGGAGATAT   ACATAG -0.5698117 3.339801e-02
res2.2[which(res2.2$column == "AAAAACAAGGGAGATAT" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] #10
##        row            column        cor            p
## 440     AG AAAAACAAGGGAGATAT -0.9375549 7.475579e-07
## 441    AAG AAAAACAAGGGAGATAT -0.7855231 8.696458e-04
## 443   AGAT AAAAACAAGGGAGATAT -0.6308160 1.556820e-02
## 445   AAAG AAAAACAAGGGAGATAT -0.8331100 2.155119e-04
## 448 AAGGAG AAAAACAAGGGAGATAT -0.6015439 2.286288e-02
## 449   AAGG AAAAACAAGGGAGATAT -0.8679448 5.725925e-05
## 452  AAAGG AAAAACAAGGGAGATAT -0.9343189 1.005095e-06
## 453   ACAG AAAAACAAGGGAGATAT -0.7646264 1.446444e-03
## 455   ACCT AAAAACAAGGGAGATAT -0.8996703 1.182541e-05
## 458  AAGAG AAAAACAAGGGAGATAT -0.9112247 5.823148e-06
res2.2[which(res2.2$row == "AAAAG" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] # 10
##        row  column        cor            p
## 701  AAAAG  AGCAGG -0.7187996 3.771126e-03
## 738  AAAAG  ACAGAT -0.5874160 2.718718e-02
## 776  AAAAG  AAAGGG -0.5648877 3.530935e-02
## 896  AAAAG AGGAGGC -0.7875134 8.261672e-04
## 1116 AAAAG  ACAGGC -0.7690186 1.305303e-03
## 1310 AAAAG     ACT -0.7020574 5.124267e-03
## 1361 AAAAG  AGAGAT -0.6021450 2.269117e-02
## 1688 AAAAG  ACTGCT -0.7672257 1.361526e-03
## 1746 AAAAG  ACACAT -0.8640085 6.768878e-05
## 2115 AAAAG  ACATAG -0.5438580 4.439291e-02
res2.2[which(res2.2$column == "AAAAG" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] # 8
##                     row column        cor            p
## 565                  AC  AAAAG -0.6445830 0.0128259548
## 568              AACCCT  AAAAG -0.8319098 0.0002243619
## 572                 AGG  AAAAG -0.7348410 0.0027548828
## 576                 ACC  AAAAG -0.8445019 0.0001447274
## 577                ATCC  AAAAG -0.8182549 0.0003474049
## 580 AAGTCTGCACACACATACT  AAAAG -0.7349647 0.0027479929
## 583                  AT  AAAAG -0.5905381 0.0261828839
## 593                AGGC  AAAAG -0.6866443 0.0066804094
res2.2[which(res2.2$row == "AAAAACC" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] #1
##          row   column        cor            p
## 2016 AAAAACC ACACATAT -0.8069037 0.0004867536
nrow(res2.2[which(res2.2$column == "AAAAACC" & res2.2$cor < (-0.5) & res2.2$p < 0.05),]) # 15
## [1] 15
res2.2[which(res2.2$row == "AAAACC" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] #0
## [1] row    column cor    p     
## <0 rows> (or 0-length row.names)
nrow(res2.2[which(res2.2$column == "AAAACC" & res2.2$cor < (-0.5) & res2.2$p < 0.05),]) # 11
## [1] 11